From e09612524eeb4289603562d6598f83deb32adb3e Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Thu, 14 Jul 2022 09:04:22 -0700 Subject: [PATCH 01/25] Minor changes for sessions usability --- Banyan/src/requests.jl | 2 +- Banyan/src/sessions.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 1be181e6..0a398a05 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -361,8 +361,8 @@ function partitioned_computation_concrete( # require the last value to be merged simply because it is being evaluated. sessions = get_sessions_dict() - session_id = get_session_id() session = get_session() + session_id = get_session_id() resource_id = session.resource_id diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index 1ed9b78e..3f4c7e4f 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -531,7 +531,7 @@ function get_session_status(session_id::String=get_session_id(); kwargs...)::Str end response = send_request_get_response(:describe_sessions, params) if !haskey(response["sessions"], session_id) - @warn "Session with ID $session_id is assumed to still be creating" + @warn "Session with ID $session_id is assumed to have just started creating" return "creating" end session_status = response["sessions"][session_id]["status"] From 42173a9d2f4234174af0b49bc0584185cc928bdd Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Wed, 3 Aug 2022 08:23:12 -0700 Subject: [PATCH 02/25] Implement configure_sampling, get_sampling_config, get_sample_rate, has_metadata, configure_sampling --- Banyan/Project.toml | 3 +- Banyan/src/Banyan.jl | 5 ++ Banyan/src/location.jl | 123 ++++++++++++++++++++++++++++++ Banyan/src/locations.jl | 2 +- Banyan/src/requests.jl | 2 + Banyan/src/sample.jl | 16 +++- Banyan/src/samples.jl | 22 ++++++ Banyan/src/session.jl | 27 ++++++- Banyan/src/sessions.jl | 25 ++++-- BanyanDataFrames/src/locations.jl | 2 +- BanyanDataFrames/src/pfs.jl | 2 +- BanyanHDF5/src/locations.jl | 2 +- BanyanImages/src/locations.jl | 4 +- 13 files changed, 214 insertions(+), 21 deletions(-) diff --git a/Banyan/Project.toml b/Banyan/Project.toml index 8fae7de8..8551e6a2 100644 --- a/Banyan/Project.toml +++ b/Banyan/Project.toml @@ -4,6 +4,7 @@ authors = ["Banyan "] version = "0.4.1" [deps] +AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" AWSSQS = "6e80b5ca-5733-51f9-999e-c18680912812" @@ -17,8 +18,8 @@ HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" IniFile = "83e8ac13-25f8-5344-8a64-a9f2b223428f" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LibGit2 = "76f85450-5226-5b5a-8eaa-529ad045b433" -MethodAnalysis = "85b6ec6f-f7df-4429-9514-a64bcd9ee824" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +MethodAnalysis = "85b6ec6f-f7df-4429-9514-a64bcd9ee824" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 796f022b..97941e14 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -40,6 +40,9 @@ using AWSCore, Serialization, TOML +using S3: @service +@service S3 + global BANYAN_API_ENDPOINT # Account management @@ -84,6 +87,7 @@ export AbstractFuture, Future, partitioned_computation, compute_inplace, compute export Sample, ExactSample, sample, sample_for_grouping, SampleForGrouping, setsample! 
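The patch above adds a per-path sampling-configuration API (`SamplingConfig`, `configure_sampling`, `get_sampling_config`, `get_sample_rate`). A minimal usage sketch follows; it is not taken from the patch itself, it assumes a running session, the S3 path and cluster name are illustrative, and the keyword names follow the `configure_sampling` signature defined later in this series:

    using Banyan
    start_session(cluster_name = "my-cluster", nworkers = 4)
    # Request a sample rate of 100 for this dataset and force a fresh sample at that rate
    configure_sampling("s3://my-bucket/data.parquet"; sample_rate = 100, force_new_sample_rate = true)
    get_sample_rate("s3://my-bucket/data.parquet")            # returns 100
    get_sampling_config("s3://my-bucket/data.parquet").rate   # also 100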
export sample_memory_usage, total_memory_usage, sample_axes, sample_keys, sample_by_key export NOTHING_SAMPLE +export SamplingConfig # Locations export Location, LocationSource, LocationDestination, located, sourced, destined @@ -98,6 +102,7 @@ export get_remotepath_id, cache_location, get_max_exact_sample_length, set_max_exact_sample_length +export LocationPath # Serialization export from_jl_value_contents, to_jl_value_contents diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 2eb889af..60b0663f 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -38,4 +38,127 @@ mutable struct Location # sample # ) # end +end + +struct LocationPath + original_path::String + path::String + path_hash_uint::UInt + path_hash::String + format_name::String + format_version::String + + function LocationPath(path, format_name, format_version) + # This function is responsible for "normalizing" the path. + # If there are multiple path strings that are technically equivalent, + # this function should map them to the same string. + path_hash = hash(path) + new( + path, + path, + path_hash, + string(path_hash), + format_name, + format_version + ) + end +end + +global TABLE_FORMATS = ["csv", "parquet", "arrow"] +z +function get_location_path_with_format(p::String, kwargs...)::LocationPath + if isempty(p) + return NO_LOCATION_PATH + end + + format_name = get(kwargs, :format, "jl") + is_sample_format_arrow = format_name == "arrow" + if is_sample_format_arrow + return LocationPath(p, "arrow", get(kwargs, :format_version, "2")) + else + for table_format in TABLE_FORMATS + if occursin(table_format, p) || format_name == p + return LocationPath(p, "arrow", "2") + end + end + end + LocationPath(p, "jl", get_julia_version()) +end + +function get_sample_path_prefix(lp::LocationPath) + format_name_sep = !isempty(lp.format_name) ? "_" : "" + format_version_sep = !isempty(lp.format_version) ? "_" : "" + lp.path_hash * "_" * lp.format_name * format_name_sep * lp.format_version * format_version_sep +end +get_sample_path(lp::LocationPath, sample_rate::Int64) = + get_sample_path_prefix(lp) * string(sample_rate) +get_metadata_path(lp::LocationPath) = lp.path_hash + +Base.hash(lp::LocationPath) = lp.path_hash_uint + +const NO_LOCATION_PATH = LocationPath("", "", "") + +get_sampling_config(path="", kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) +function get_sampling_configs() + global session_sampling_configs + session_sampling_configs[_get_session_id_no_error()] +end +get_sampling_config(l_path::LocationPath)::SamplingConfig = + get(get_sampling_configs(), l_path, sampling_configs[NO_LOCATION_PATH]) + +get_sample_rate(p::String; kwargs...) = + get_sample_rate(get_location_path_with_format(p; kwargs...)) +function get_sample_rate(l_path::LocationPath) + # Get the desired sample rate + desired_sample_rate = get_sampling_config(l_path).rate + + # Find a cached sample with a similar sample ratBucket=e + # TODO: Just have a try/catch here so that if the bucket doesn't exist we just return the default + # TODO: Make the above code get used in location constructors for getting the desired sample rate + sc = get_sampling_config(l_path) + pre = sc.force_new_sample_rate ? 
get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) + banyan_samples_objects = try + S3.list_objects_v2(Bucket="banyan_samples", prefix=pre)["Contents"] + catch + return desired_sample_rate + end + sample_rate = -1 + for banyan_samples_object in banyan_samples_objects + object_key = banyan_samples_object["Key"] + if startswith(object_key, banyan_samples_object_prefix) + object_sample_rate = parse(Int64, object_key[(findlast("_", object_key).start+1):end]) + object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) + curr_sample_rate_diff = abs(object_sample_rate - sample_rate) + if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff + sample_rate = object_sample_rate + end + end + end + sample_rate != -1 ? sample_rate : desired_sample_rate +end + +# function get_location(l_path::LocationPath) +# sessions_dict = get_sessions_dict() +# session_id = _get_session_id_no_error() +# desired_sample_rate = if haskey(sessions_dict, session_id) +# sampling_configs = sessions_dict[session_id].sampling_configs +# get(sampling_configs, l_path, sampling_configs[NO_LOCATION_PATH]). +# end + +function has_metadata(l_path:: LocationPath)::Bool + try + !isempty(S3.list_objects_v2(Bucket="banyan_metadata", prefix=get_metadata_path(l_path))["Contents"]) + catch + false + end +end + +function has_sample(l_path:: LocationPath)::Bool + sc = get_sampling_config(l_path) + pre = sc.force_new_sample_rate ? get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) + try + !isempty(S3.list_objects_v2(Bucket="banyan_samples", prefix=pre)["Contents"]) + catch + false + end end \ No newline at end of file diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index c08bc71d..12cd3888 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -293,7 +293,7 @@ getsamplenrows(totalnrows::Int64)::Int64 = totalnrows else # Must have at least 1 row - cld(totalnrows, get_session().sample_rate) + cld(totalnrows, get_sample_rate()) end # We maintain a cache of locations and a cache of samples. 
Locations contain diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 0a398a05..9aba3d95 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -566,6 +566,7 @@ function send_evaluation(value_id::ValueId, session_id::SessionId) "organization_id" => get_session().organization_id, "cluster_instance_id" => get_session().cluster_instance_id, "cluster_name" => get_session().cluster_name, + "sampling_configs" => sampling_configs_to_jl(get_sampling_configs()) ), ) if isnothing(response) @@ -667,6 +668,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) "organization_id" => get_session().organization_id, "cluster_instance_id" => get_session().cluster_instance_id, "cluster_name" => get_session().cluster_name, + "sampling_configs" => sampling_configs_to_jl(get_sampling_configs()) ), ) if isnothing(response) diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index c98e1df8..105ab327 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -9,11 +9,11 @@ mutable struct Sample groupingkeys::Vector{<:Any} Sample() = - new(nothing, objectid(nothing), 0, get_session().sample_rate, Any[]) + new(nothing, objectid(nothing), 0, get_sample_rate(), Any[]) Sample(value::Any) = - new(value, objectid(value), sample_memory_usage(value), get_session().sample_rate, Any[]) + new(value, objectid(value), sample_memory_usage(value), get_sample_rate(), Any[]) function Sample(value::Any, memory_usage::Int64) - sample_rate = get_session().sample_rate + sample_rate = get_sample_rate() memory_usage = convert(Int64, round(memory_usage / sample_rate))::Int64 new(value, objectid(value), memory_usage, sample_rate, Any[]) end @@ -22,3 +22,13 @@ mutable struct Sample new(value, objectid(value), memory_usage, rate, Any[]) end end + +struct SamplingConfig + rate::Int64 + always_exact::Bool + max_num_bytes_exact::Int64 + force_new_sample_rate::Bool +end + +const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("256 MB"), false) +session_sampling_configs = Dict{SessionId,SamplingConfig}("" => DEFAULT_SAMPLING_CONFIG) \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 9daea353..a6210acd 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -1,3 +1,25 @@ +function configure_sampling( + path=""; + rate=nothing, + always_exact=nothing, + max_num_bytes_exact=nothing, + kwargs... +) + global session_sampling_configs + + sc = get_sampling_config(path; kwargs...) + nsc = SamplingConfig( + !isnothing(sc.rate) ? rate : sc.rate, + !isnothing(sc.always_exact) ? always_exact : sc.always_exact, + !isnothing(sc.max_num_bytes_exact) ? max_num_bytes_exact : sc.max_num_bytes_exact, + !isnothing(sc.force_new_sample_rate) ? force_new_sample_rate : sc.force_new_sample_rate, + ) + + session_id = _get_session_id_no_error() + lp = get_location_path_with_format(path; kwargs...) 
+ session_sampling_configs[session_id][lp] = nsc +end + ############################################################### # Sample that caches properties returned by an AbstractSample # ############################################################### diff --git a/Banyan/src/session.jl b/Banyan/src/session.jl index 8306fe45..cbec24ec 100644 --- a/Banyan/src/session.jl +++ b/Banyan/src/session.jl @@ -2,7 +2,6 @@ mutable struct Session id::SessionId resource_id::ResourceId nworkers::Int64 - sample_rate::Int64 locations::Dict{ValueId,Location} pending_requests::Vector{Request} # This is a `WeakKeyDict` so that futures can be GC-ed as long as all @@ -30,7 +29,6 @@ mutable struct Session session_id::SessionId, resource_id::ResourceId, nworkers::Int64, - sample_rate::Int64, organization_id::String = "", cluster_instance_id::String = "", not_using_modules::Vector{String} = NOT_USING_MODULES, @@ -44,7 +42,6 @@ mutable struct Session session_id, resource_id, nworkers, - sample_rate, Dict{ValueId,Location}(), [], Dict{ValueId,Future}(), @@ -58,7 +55,29 @@ mutable struct Session is_session_ready, scatter_queue_url, gather_queue_url, - execution_queue_url, + execution_queue_url ) end end + +function sampling_configs_to_jl(sampling_configs::Dict{LocationPath,SamplingConfig}) + res = Tuple{Tuple{String,String,String},Tuple{Int64,Bool,Int64,Bool}}[] + for (l::LocationPath, s::SamplingConfig) in sampling_configs + push!( + res, + ( + (l.original_path, l.format_name, l.format_version), + (s.rate, s.always_exact, s.max_num_bytes_exact, s.force_new_sample_rate), + ), + ) + end + res +end + +function sampling_configs_from_jl(sampling_configs) + res = Dict{LocationPath,SamplingConfig}() + for (l, s) in sampling_configs + res[LocationPath(l[1], l[2], l[3])] = SamplingConfig(s[1], s[2], s[3], s[4]) + end + res +end \ No newline at end of file diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index 3f4c7e4f..ab3fd091 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -86,7 +86,6 @@ function _start_session( store_logs_in_s3::Bool, store_logs_on_cluster::Bool, log_initialization::Bool, - sample_rate::Int64, session_name::String, files::Vector{String}, code_files::Vector{String}, @@ -109,7 +108,10 @@ function _start_session( no_email::Bool, for_running::Bool, sessions::Dict{String,Session}, + sampling_configs::Dict{LocationPath,SamplingConfig} ) + global session_sampling_configs + # Construct parameters for starting session cluster_name = if cluster_name == NOTHING_STRING running_clusters = get_running_clusters() @@ -129,7 +131,6 @@ function _start_session( session_configuration = Dict{String,Any}( "cluster_name" => cluster_name, "num_workers" => nworkers, - "sample_rate" => sample_rate, "release_resources_after" => release_resources_after == -1 ? 
nothing : release_resources_after, "return_logs" => print_logs, "store_logs_in_s3" => store_logs_in_s3, @@ -141,7 +142,8 @@ function _start_session( "using_modules" => using_modules, "reuse_resources" => !force_update_files, "estimate_available_memory" => estimate_available_memory, - "language" => "jl" + "language" => "jl", + "sampling_configs" => sampling_configs_to_jl(sampling_configs) ) if session_name != NOTHING_STRING session_configuration["session_name"] = session_name @@ -269,7 +271,6 @@ function _start_session( session_id, resource_id, nworkers, - sample_rate, organization_id, cluster_instance_id, not_using_modules, @@ -279,6 +280,7 @@ function _start_session( gather_queue_url=gather_queue_url, execution_queue_url=execution_queue_url ) + session_sampling_configs[session_id] = sampling_configs if !nowait wait_for_session(session_id) @@ -298,7 +300,6 @@ function start_session(; store_logs_in_s3::Bool = true, store_logs_on_cluster::Bool = false, log_initialization::Bool = false, - sample_rate::Int64 = nworkers, session_name::String = NOTHING_STRING, files::Vector{String} = String[], code_files::Vector{String} = String[], @@ -318,6 +319,10 @@ function start_session(; nowait::Bool = true, email_when_ready::Union{Bool,Nothing} = nothing, for_running::Bool = false, + always_exact=nothing, + sample_rate=nothing, + max_num_bytes_exact=nothing, + force_new_sample_rate=nothing, kwargs..., )::SessionId # Should save 5ms of overhead @@ -331,6 +336,12 @@ function start_session(; # Configure configure(; kwargs...) + configure_sampling(; + always_exact=always_exact, + sample_rate=sample_rate, + max_num_bytes_exact=max_num_bytes_exact, + force_new_sample_rate=force_new_sample_rate + ) current_session_id = _start_session( cluster_name, @@ -340,7 +351,6 @@ function start_session(; store_logs_in_s3, store_logs_on_cluster, log_initialization, - sample_rate, session_name, files, code_files, @@ -362,7 +372,8 @@ function start_session(; isnothing(email_when_ready) ? false : email_when_ready, isnothing(email_when_ready), for_running, - sessions + sessions, + get_sampling_configs() ) current_session_id end diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 73063560..9da81827 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -3,7 +3,7 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_invalid, invalidate_metadata, invalidate_sample, max_exact_sample_length)::Location - session_sample_rate = get_session().sample_rate + session_sample_rate = get_sample_rate() is_main = is_main_worker() # Get cached Location and if it has valid parameters and sample, return diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index a935365d..98027383 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -517,7 +517,7 @@ function WriteHelper(@nospecialize(format_value)) # Gather # of rows, # of bytes, empty sample, and actual sample nbytes = part_res isa Empty ? 0 : Banyan.total_memory_usage(part_res) - sample_rate = get_session().sample_rate + sample_rate = get_sample_rate() sampled_part = (part_res isa Empty || is_disk) ? empty_df : Banyan.get_sample_from_data(part_res, sample_rate, nrows) gathered_data = gather_across((nrows, nbytes, part_res isa Empty ? 
part_res : empty(part_res), sampled_part), comm) diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 1120e83b..537bf573 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -28,7 +28,7 @@ HDF5_getindex_retry = retry(HDF5.getindex; delays=Base.ExponentialBackOff(; n=5) function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sample_invalid, invalidate_metadata, invalidate_sample, max_exact_sample_length) # Get session information - session_sample_rate = get_session().sample_rate + session_sample_rate = get_sample_rate() worker_idx, nworkers = get_worker_idx(), get_nworkers() is_main = worker_idx == 1 diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 9ce67042..eb244b33 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -98,7 +98,7 @@ # if isnothing(remote_sample) -# samplesize = (nimages <= MAX_EXACT_SAMPLE_NUM_IMAGES) ? nimages : ceil(Int64, nimages / get_session().sample_rate) +# samplesize = (nimages <= MAX_EXACT_SAMPLE_NUM_IMAGES) ? nimages : ceil(Int64, nimages / get_sample_rate()) # nbytes_of_sample = 0 # progressbar = Progress(length(files_to_read_from), "Collecting sample from $remotepath") @@ -282,7 +282,7 @@ function _remote_image_source( add_channelview ) # Get session information - session_sample_rate = get_session().sample_rate + session_sample_rate = get_sample_rate() worker_idx, nworkers = get_worker_idx(), get_nworkers() is_main = worker_idx == 1 From d8ebaf20fcacece7f3127f0b34a66088af0c6d77 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Fri, 5 Aug 2022 09:51:07 -0700 Subject: [PATCH 03/25] Implement get_location_source and get_organization_id --- Banyan/Project.toml | 2 + Banyan/src/Banyan.jl | 1 + Banyan/src/location.jl | 187 +++++++++++++++++++++++++++++++++++++---- Banyan/src/utils.jl | 21 +++++ 4 files changed, 194 insertions(+), 17 deletions(-) diff --git a/Banyan/Project.toml b/Banyan/Project.toml index 8551e6a2..02ad86b3 100644 --- a/Banyan/Project.toml +++ b/Banyan/Project.toml @@ -8,6 +8,7 @@ AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" AWSSQS = "6e80b5ca-5733-51f9-999e-c18680912812" +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" @@ -33,6 +34,7 @@ TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53" AWSCore = "0.6" AWSS3 = "0.7" AWSSQS = "0.6" +Arrow = "2" DataStructures = "0.18" Downloads = "^1.4" FileIO = "1.9.1" diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 97941e14..5a34287c 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -40,6 +40,7 @@ using AWSCore, Serialization, TOML +using AWS.AWSServices: s3 using S3: @service @service S3 diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 60b0663f..006752f5 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -93,6 +93,8 @@ end get_sample_path(lp::LocationPath, sample_rate::Int64) = get_sample_path_prefix(lp) * string(sample_rate) get_metadata_path(lp::LocationPath) = lp.path_hash +banyan_samples_bucket_name() = "banyan-samples-$(get_organization_id())" +banyan_metadata_bucket_name() = "banyan-metadata-$(get_organization_id())" Base.hash(lp::LocationPath) = lp.path_hash_uint @@ -108,25 +110,30 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = get_sample_rate(p::String; 
kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) +parse_sample_rate(object_key) = + parse(Int64, object_key[(findlast("_", object_key).start+1):end]) function get_sample_rate(l_path::LocationPath) # Get the desired sample rate desired_sample_rate = get_sampling_config(l_path).rate - # Find a cached sample with a similar sample ratBucket=e - # TODO: Just have a try/catch here so that if the bucket doesn't exist we just return the default - # TODO: Make the above code get used in location constructors for getting the desired sample rate sc = get_sampling_config(l_path) - pre = sc.force_new_sample_rate ? get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) + if sc.force_new_sample_rate + return desired_sample_rate + end + + # Find a cached sample with a similar sample rate + pre = get_sample_path_prefix(l_path) banyan_samples_objects = try - S3.list_objects_v2(Bucket="banyan_samples", prefix=pre)["Contents"] + res = S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=pre)["Contents"] + res isa Base.Vector ? res : [res] catch return desired_sample_rate end sample_rate = -1 for banyan_samples_object in banyan_samples_objects object_key = banyan_samples_object["Key"] - if startswith(object_key, banyan_samples_object_prefix) - object_sample_rate = parse(Int64, object_key[(findlast("_", object_key).start+1):end]) + if startswith(object_key, pre) + object_sample_rate = parse_sample_rate(object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) curr_sample_rate_diff = abs(object_sample_rate - sample_rate) if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff @@ -137,17 +144,9 @@ function get_sample_rate(l_path::LocationPath) sample_rate != -1 ? sample_rate : desired_sample_rate end -# function get_location(l_path::LocationPath) -# sessions_dict = get_sessions_dict() -# session_id = _get_session_id_no_error() -# desired_sample_rate = if haskey(sessions_dict, session_id) -# sampling_configs = sessions_dict[session_id].sampling_configs -# get(sampling_configs, l_path, sampling_configs[NO_LOCATION_PATH]). -# end - function has_metadata(l_path:: LocationPath)::Bool try - !isempty(S3.list_objects_v2(Bucket="banyan_metadata", prefix=get_metadata_path(l_path))["Contents"]) + !isempty(S3.list_objects_v2(Bucket=banyan_metadata_bucket_name(), prefix=get_metadata_path(l_path))["Contents"]) catch false end @@ -157,8 +156,162 @@ function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) pre = sc.force_new_sample_rate ? get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) try - !isempty(S3.list_objects_v2(Bucket="banyan_samples", prefix=pre)["Contents"]) + !isempty(S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=pre)["Contents"]) catch false end +end + +twodigit(i::Int64) = i < 10 ? ("0" * string(i)) : string(i) + +get_src_params_dict(d::Union{Nothing,Base.ImmutableDict{String, String}}) = + isnothing(d) ? 
Dict{String,String}() : Dict{String,String}(d) + +get_src_params_dict_from_arrow(p) = Arrow.Table(p) |> Arrow.getmetadata |> get_src_params_dict + +struct AWSExceptionInfo + is_aws::Bool + unmodified_since::Bool + not_found::Bool + + function AWSExceptionInfo(e) + is_aws = e isa AWSException && e.cause isa AWS.HTTP.ExceptionRequest.StatusError + new(is_aws, is_aws && e.cause.status == 304, is_aws && e.cause.status == 404) + end +end + +function get_location_source(lp::LocationPath)::Tuple{Location,String} + # Load in metadata + metadata_path = get_metadata_path(lp) + metadata_local_path = joinpath(homedir(), ".banyan", "metadata", metadata_path) + metadata_s3_path = "/$(banyan_metadata_bucket_name())/$metadata_path" + src_params::Dict{String, String} = if exists(metadata_local_path) + lm = Dates.unix2datetime(mtime(metadata_local_path)) + if_modified_since_string = + "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" + try + get_src_params_dict_from_arrow(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string)))) + catch e + if is_debug_on() + show(e) + end + ei = AWSExceptionInfo(e) + if ei.not_found + Dict{String, String}() + elseif ei.unmodified_since + get_src_params_dict_from_arrow(metadata_local_path) + else + @warn "Assumming locally stored metadata is invalid because of following error in accessing the metadata copy in the cloud" + show(e) + Dict{String, String}() + end + end + else + try + get_src_params_dict_from_arrow(s3("GET", metadata_s3_path)) + catch e + if is_debug_on() + show(e) + end + if !AWSExceptionInfo(e).not_found + @warn "Assumming metadata isn't copied in the cloud because of following error in attempted access" + show(e) + end + Dict{String, String}() + end + end + + # Load in sample + + sc = get_sampling_config() + force_new_sample_rate = sc.force_new_sample_rate + desired_sample_rate = sc.rate + sample_path_prefix = get_sample_path_prefix(lp) + + # Find local samples + found_local_samples = Tuple{String,Int64}[] + found_local_sample_rate_diffs = Int64[] + samples_local_dir = joinpath(homedir(), ".banyan", "samples") + for local_sample_path in readdir(samples_local_dir, join=true) + if startswith(local_sample_path, sample_path_prefix) + local_sample_rate = parse_sample_rate(object_key) + diff_sample_rate = abs(local_sample_rate - desired_sample_rate) + if !force_new_sample_rate || sample_rate_diff == 0 + push!(found_local_samples, (local_sample_path, local_sample_rate)) + push!(found_local_sample_rate_diffs, diff_sample_rate) + end + end + end + + # Sort in descending suitability (the most suitable sample is the one with sample + # rate closest to the desired sample rate) + found_local_samples = found_local_samples[sortperm(found_local_sample_rate_diffs)] + + # Find a local sample that is up-to-date + final_local_sample_path = "" + for (sample_local_path, sample_rate) in found_local_samples + lm = Dates.unix2datetime(mtime(sample_local_path)) + if_modified_since_string = + "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" + sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix$sample_rate" + try + blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) + write(sample_local_path, blob) # This overwrites the existing file + final_local_sample_path = sample_local_path + break 
+ catch e + if is_debug_on() + show(e) + end + ei = AWSExceptionInfo(e) + if ei.not_found + @warn "Assumming locally stored metadata is invalid because it is not backed up to the cloud" + elseif ei.unmodified_since + final_local_sample_path = sample_local_path + break + else + @warn "Assumming locally stored metadata is invalid because of following error in accessing the metadata copy in the cloud" + show(e) + end + end + end + + # If no such sample is found, search the S3 bucket + banyan_samples_objects = try + res = S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=sample_path_prefix)["Contents"] + res isa Base.Vector ? res : [res] + catch e + if is_debug_on() + show(e) + end + [] + end + banyan_samples_object_sample_rate = -1 + for banyan_samples_object in banyan_samples_objects + object_key = banyan_samples_object["Key"] + if startswith(object_key, banyan_samples_object_prefix) + object_sample_rate = parse_sample_rate(object_key) + object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) + curr_sample_rate_diff = abs(object_sample_rate - sample_rate) + if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff + banyan_samples_object_sample_rate = object_sample_rate + end + end + end + if banyan_samples_object_sample_rate != -1 + sample_path_suffix = "$sample_path_prefix$banyan_samples_object_sample_rate" + blob = s3("GET", "/$(banyan_samples_bucket_name())/$sample_path_suffix") + final_local_sample_path = joinpath(samples_local_dir, sample_path_suffix) + write(final_local_sample_path, blob) + end + + res_location = LocationSource( + get(src_params, "name", "Remote"), + src_params, + get(src_params, "total_memory_usage", 0), + NOTHING_SAMPLE + ) + res_location.parameters_invalid = isempty(src_params) + res_location.sample_invalid = isempty(final_local_sample_path) + (res_location, final_local_sample_path) end \ No newline at end of file diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 53b96b74..3b02e192 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -199,6 +199,25 @@ function configure(user_id, api_key, ec2_key_pair_name, banyanconfig_path) return banyan_config end +# Getting organization IDs + +organization_ids = Dict{String,String} +function get_organization_id() + global organization_ids + global sessions + user_id = configure()["banyan"]["user_id"] + session_id = _get_session_id_no_error() + if haskey(organization_ids, user_id) + organization_ids[user_id] + elseif haskey(sessions, session_id) + sessions[session_id].organization_ids + else + organization_id = send_request_get_response(:describe_users, Dict())["organization_id"] + organization_ids[user_id] = organization_id + organization_id + end +end + @specialize """ @@ -293,6 +312,8 @@ method_to_string(method::Symbol)::String = begin "update-cluster" elseif method == :set_cluster_ready "set-cluster-ready" + elseif method == :describe_users + "describe-users" end end From fbf648170cd1e12a5c7f59a997dec66651434f59 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Sat, 6 Aug 2022 19:51:39 -0700 Subject: [PATCH 04/25] Make changes to location constructor and PFs for BDF.jl --- Banyan/src/Banyan.jl | 6 +- Banyan/src/location.jl | 45 +++- Banyan/src/locations.jl | 20 +- Banyan/src/precompile.jl | 2 +- Banyan/src/queues.jl | 4 +- Banyan/src/requests.jl | 14 +- Banyan/src/sample.jl | 19 +- Banyan/src/samples.jl | 27 ++- Banyan/src/sessions.jl | 11 +- Banyan/src/utils_pfs.jl | 6 +- BanyanDataFrames/src/locations.jl | 256 +++++++++++++-------- BanyanDataFrames/src/pfs.jl | 
100 ++++---- BanyanDataFrames/src/utils_pfs.jl | 9 + BanyanDataFrames/test/sample_collection.jl | 11 +- BanyanHDF5/src/locations.jl | 4 +- BanyanImages/src/locations.jl | 16 +- BanyanImages/src/pfs.jl | 2 +- BanyanImages/test/pfs.jl | 4 +- 18 files changed, 335 insertions(+), 221 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 5a34287c..f51bad16 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -106,7 +106,7 @@ export get_remotepath_id, export LocationPath # Serialization -export from_jl_value_contents, to_jl_value_contents +export from_jl_string, to_jl_string # Queues export receive_from_client, send_to_client, get_sqs_dict_from_url @@ -171,8 +171,8 @@ export is_debug_on, get_partition_idx_from_divisions, isoverlapping, to_jl_value, - to_jl_value_contents, - from_jl_value_contents, + to_jl_string, + from_jl_string, get_divisions, getpath, buftovbuf, diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 006752f5..93e41110 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -10,7 +10,7 @@ mutable struct Location dst_parameters::LocationParameters total_memory_usage::Int64 sample::Sample - parameters_invalid::Bool + metadata_invalid::Bool sample_invalid::Bool # function Location( @@ -106,9 +106,11 @@ function get_sampling_configs() session_sampling_configs[_get_session_id_no_error()] end get_sampling_config(l_path::LocationPath)::SamplingConfig = - get(get_sampling_configs(), l_path, sampling_configs[NO_LOCATION_PATH]) + let scs = get_sampling_configs() + get(scs, l_path, scs[NO_LOCATION_PATH]) + end -get_sample_rate(p::String; kwargs...) = +get_sample_rate(p::String=""; kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) parse_sample_rate(object_key) = parse(Int64, object_key[(findlast("_", object_key).start+1):end]) @@ -116,6 +118,11 @@ function get_sample_rate(l_path::LocationPath) # Get the desired sample rate desired_sample_rate = get_sampling_config(l_path).rate + # If we just want the default sample rate or if a new sample rate is being + # forced, then just return that. + if isempty(l_path.path) + return desired_sample_rate + end sc = get_sampling_config(l_path) if sc.force_new_sample_rate return desired_sample_rate @@ -180,17 +187,24 @@ struct AWSExceptionInfo end end -function get_location_source(lp::LocationPath)::Tuple{Location,String} +function get_location_source(lp::LocationPath)::Tuple{Location,String,String} + # This checks local cache and S3 cache for sample and metadata files. + # It then returns a Location object (with a null sample) and the local file names + # to read/write the metadata and sample from/to. 
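    # NOTE: The lookups below implement a conditional GET against S3. The local copy's
    # mtime is formatted as an HTTP If-Modified-Since header; a 200 response means the
    # cloud copy is newer (download it and overwrite the local copy), a 304 means the
    # local copy is still current (reuse it), and a 404 means there is no copy in the
    # cloud (treat the cached metadata or sample as invalid).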
+ # Load in metadata metadata_path = get_metadata_path(lp) metadata_local_path = joinpath(homedir(), ".banyan", "metadata", metadata_path) metadata_s3_path = "/$(banyan_metadata_bucket_name())/$metadata_path" + src_params_not_stored_locally = false src_params::Dict{String, String} = if exists(metadata_local_path) lm = Dates.unix2datetime(mtime(metadata_local_path)) if_modified_since_string = "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" try - get_src_params_dict_from_arrow(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string)))) + d = get_src_params_dict_from_arrow(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string)))) + src_params_not_stored_locally = true + d catch e if is_debug_on() show(e) @@ -208,7 +222,9 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String} end else try - get_src_params_dict_from_arrow(s3("GET", metadata_s3_path)) + d = get_src_params_dict_from_arrow(s3("GET", metadata_s3_path)) + src_params_not_stored_locally = true + d catch e if is_debug_on() show(e) @@ -220,6 +236,10 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String} Dict{String, String}() end end + # Store metadata locally + if src_params_not_stored_locally && !isempty(d) + Arrow.write(metadata_local_path, Arrow.Table(); metadata=src_params) + end # Load in sample @@ -304,14 +324,19 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String} final_local_sample_path = joinpath(samples_local_dir, sample_path_suffix) write(final_local_sample_path, blob) end - + + # Construct and return LocationSource res_location = LocationSource( get(src_params, "name", "Remote"), src_params, - get(src_params, "total_memory_usage", 0), + parse(Int64, get(src_params, "total_memory_usage", "0")), NOTHING_SAMPLE ) - res_location.parameters_invalid = isempty(src_params) + res_location.metadata_invalid = isempty(src_params) res_location.sample_invalid = isempty(final_local_sample_path) - (res_location, final_local_sample_path) + ( + res_location, + metaata_local_path, + isempty(final_local_sample_path) ? final_local_sample_path : "sample_path_prefix$desired_sample_rate" + ) end \ No newline at end of file diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 12cd3888..f1564ed5 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -68,7 +68,7 @@ function sourced(fut::Future, loc::Location) # Otherwise just make a fresh new sample. Sample() end, - loc.parameters_invalid, + loc.metadata_invalid, loc.sample_invalid ), ) @@ -92,7 +92,7 @@ function sourced(fut::Future, loc::Location) # location if there is one. 
fut_location.sample end, - loc.parameters_invalid, + loc.metadata_invalid, loc.sample_invalid ), ) @@ -116,7 +116,7 @@ function destined(fut::Future, loc::Location) loc.dst_parameters, fut_location.total_memory_usage, Sample(), - loc.parameters_invalid, + loc.metadata_invalid, loc.sample_invalid ), ) @@ -131,7 +131,7 @@ function destined(fut::Future, loc::Location) loc.dst_parameters, fut_location.total_memory_usage, fut_location.sample, - fut_location.parameters_invalid, + fut_location.metadata_invalid, fut_location.sample_invalid ), ) @@ -219,7 +219,7 @@ Size(val)::Location = LocationSource( "Value", Dict{String,Any}("value" => to_jl_value(val)), 0, - Sample(indexapply(getsamplenrows, val, 1)), + Sample(indexapply(getsamplenrows, val, 1), 1), ) function Client(val::T)::Location where {T} @@ -313,7 +313,7 @@ _invalidate_metadata(remotepath) = let p = get_location_path(remotepath) if isfile(p) loc = deserialize_retry(p) - loc.parameters_invalid = true + loc.metadata_invalid = true serialize(p, loc) end end @@ -368,10 +368,10 @@ function get_cached_location(remotepath, remotepath_id, metadata_invalid, sample INVALID_LOCATION end curr_location.sample_invalid = curr_location.sample_invalid || sample_invalid - curr_location.parameters_invalid = curr_location.parameters_invalid || metadata_invalid + curr_location.metadata_invalid = curr_location.metadata_invalid || metadata_invalid curr_sample_invalid = curr_location.sample_invalid - curr_parameters_invalid = curr_location.parameters_invalid - curr_location, curr_sample_invalid, curr_parameters_invalid + curr_metadata_invalid = curr_location.metadata_invalid + curr_location, curr_sample_invalid, curr_metadata_invalid end get_cached_location(remotepath, metadata_invalid, sample_invalid) = @@ -381,7 +381,7 @@ function cache_location(remotepath, remotepath_id, location_res::Location, inval location_path = get_location_path(remotepath, remotepath_id) location_to_write = deepcopy(location_res) location_to_write.sample_invalid = location_to_write.sample_invalid || invalidate_sample - location_to_write.parameters_invalid = location_to_write.parameters_invalid || invalidate_metadata + location_to_write.metadata_invalid = location_to_write.metadata_invalid || invalidate_metadata serialize(location_path, location_to_write) end cache_location(remotepath, location_res::Location, invalidate_sample, invalidate_metadata) = diff --git a/Banyan/src/precompile.jl b/Banyan/src/precompile.jl index 9be89220..e3e89a2f 100644 --- a/Banyan/src/precompile.jl +++ b/Banyan/src/precompile.jl @@ -296,7 +296,7 @@ function _precompile_() precompile(download_remote_path, (String,)) precompile(download_remote_s3_path, (String,)) Base.precompile(Tuple{typeof(sqs_get_queue_with_retries),Dict{Symbol, Any},Vararg{Any}}) # time: 0.24037404 - precompile(to_jl_value_contents, (Function,)) + precompile(to_jl_string, (Function,)) # futures.jl precompile(create_new_future, (Location, Future, String)) diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index 6e231a7d..2cc6c23c 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -132,7 +132,7 @@ function receive_from_client(value_id::ValueId) ) # Receive response from client m = JSON.parse(get_next_message(get_scatter_queue())[1]) - v = from_jl_value_contents(m["contents"]::String) + v = from_jl_string(m["contents"]::String) v end @@ -153,7 +153,7 @@ end function send_to_client(value_id::ValueId, value, worker_memory_used = 0) MAX_MESSAGE_LENGTH = 220_000 - message = to_jl_value_contents(value)::String + message = 
to_jl_string(value)::String i = 1 while true is_last_message = length(message) <= MAX_MESSAGE_LENGTH diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 9aba3d95..6be89660 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -24,7 +24,7 @@ )::Tuple{Union{Nothing,String},Union{Nothing,DateTime}} value_id = message["value_id"]::ValueId if value_id == "-2" && isnothing(error_for_main_stuck_time) - error_for_main_stuck_msg::String = from_jl_value_contents(message["contents"]::String) + error_for_main_stuck_msg::String = from_jl_string(message["contents"]::String) if contains(error_for_main_stuck_msg, "session $(get_session_id())") error_for_main_stuck = error_for_main_stuck_msg error_for_main_stuck_time = Dates.now() @@ -277,7 +277,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n JSON.json( Dict{String,Any}( "value_id" => value_id, - "contents" => to_jl_value_contents(f.value) + "contents" => to_jl_string(f.value) ), ), ) @@ -296,7 +296,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n contents = get(partial_gathers, value_id, "") * message["contents"]::String # @debug "Received gather request for $value_id" if haskey(session.futures_on_client, value_id) - value = from_jl_value_contents(contents) + value = from_jl_string(contents) f = session.futures_on_client[value_id]::Future f.value = value # TODO: Update stale/mutated here to avoid costly @@ -623,11 +623,11 @@ end # Make the `offloaded` function on the client side keep looping and # (1) checking receive_next_message and # (2) checking for message[“kind”] == "GATHER" and -# (3) `break`ing and `return`ing the value (using `from_jl_value_contents(message["contents"])`) +# (3) `break`ing and `return`ing the value (using `from_jl_string(message["contents"])`) # if value_id == -1 # Make `offloaded` function in Banyan.jl # which calls evaluate passing in a string of bytes -# by serializing the given function (just call to_jl_value_contents on it) +# by serializing the given function (just call to_jl_string on it) # and passing it in with the parameter offloaded_function_code # # Make `offloaded` function specify @@ -642,7 +642,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) # doesn't need information about memory usage from intiial package loading. # Get serialized function - serialized::String = to_jl_value_contents((given_function, args)) + serialized::String = to_jl_string((given_function, args)) # Submit evaluation request !isempty(get_session().organization_id) || error("Organization ID not stored locally for this session") @@ -713,7 +713,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) # recompute the initial available memory every time we start a session # and this should presumably include the offloaded memory usage. 
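                # NOTE: `from_jl_string` (the renamed `from_jl_value_contents`) base64-decodes the
                # gathered string and deserializes it back into the value returned by the offloaded
                # function.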
get_session().worker_memory_used = get_session().worker_memory_used + memory_used - stored_message = from_jl_value_contents(contents) + stored_message = from_jl_string(contents) end error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(message, error_for_main_stuck, error_for_main_stuck_time) elseif (message_type == "EVALUATION_END") diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index 105ab327..1837e244 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -10,16 +10,16 @@ mutable struct Sample Sample() = new(nothing, objectid(nothing), 0, get_sample_rate(), Any[]) - Sample(value::Any) = - new(value, objectid(value), sample_memory_usage(value), get_sample_rate(), Any[]) - function Sample(value::Any, memory_usage::Int64) - sample_rate = get_sample_rate() - memory_usage = convert(Int64, round(memory_usage / sample_rate))::Int64 + # Sample(value::Any) = + # new(value, objectid(value), sample_memory_usage(value), get_sample_rate(), Any[]) + function Sample(value::Any, total_memory_usage::Int64, sample_rate::Int64) + # sample_rate = get_sample_rate() + memory_usage = convert(Int64, round(total_memory_usage / sample_rate))::Int64 new(value, objectid(value), memory_usage, sample_rate, Any[]) end - function Sample(value::Any, memory_usage::Int64, rate::Int64) + function Sample(value::Any, sample_rate::Int64) # This is only for the NOTHING_SAMPLE and ExactSample - new(value, objectid(value), memory_usage, rate, Any[]) + new(value, objectid(value), sample_memory_usage(value), sample_rate, Any[]) end end @@ -28,7 +28,8 @@ struct SamplingConfig always_exact::Bool max_num_bytes_exact::Int64 force_new_sample_rate::Bool + assume_shuffled::Bool end -const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("256 MB"), false) -session_sampling_configs = Dict{SessionId,SamplingConfig}("" => DEFAULT_SAMPLING_CONFIG) \ No newline at end of file +const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("256 MB"), false, true) +session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index a6210acd..7d93572d 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -1,30 +1,41 @@ function configure_sampling( path=""; - rate=nothing, + sample_rate=nothing, always_exact=nothing, max_num_bytes_exact=nothing, + force_new_sample_rate=nothing, + assume_shuffled=nothing, + for_all_locations=false, kwargs... ) global session_sampling_configs sc = get_sampling_config(path; kwargs...) nsc = SamplingConfig( - !isnothing(sc.rate) ? rate : sc.rate, - !isnothing(sc.always_exact) ? always_exact : sc.always_exact, - !isnothing(sc.max_num_bytes_exact) ? max_num_bytes_exact : sc.max_num_bytes_exact, - !isnothing(sc.force_new_sample_rate) ? force_new_sample_rate : sc.force_new_sample_rate, + !isnothing(sample_rate) ? rate : sc.rate, + !isnothing(always_exact) ? always_exact : sc.always_exact, + !isnothing(max_num_bytes_exact) ? max_num_bytes_exact : sc.max_num_bytes_exact, + !isnothing(force_new_sample_rate) ? force_new_sample_rate : sc.force_new_sample_rate, + !isnothing(assume_shuffled) ? assume_shuffled : sc.assume_shuffled, ) session_id = _get_session_id_no_error() lp = get_location_path_with_format(path; kwargs...) 
- session_sampling_configs[session_id][lp] = nsc + sampling_configs = session_sampling_configs[session_id] + if for_all_locations + empty!(sampling_configs) + sampling_configs[NO_LOCATION_PATH] = nsc + else + sampling_configs[lp] = nsc + end + end ############################################################### # Sample that caches properties returned by an AbstractSample # ############################################################### -ExactSample(value::Any) = Sample(value, sample_memory_usage(value), 1) +ExactSample(value::Any) = Sample(value, 1) ExactSample(value::Any, memory_usage::Int64) = Sample(value, memory_usage, 1) function setsample!(fut::Future, value::Any) @@ -188,7 +199,7 @@ function sample_max(A::T, key::K) where {T,K} isempty(A) ? nothing : _maximum(orderinghashes(A, key)) end -const NOTHING_SAMPLE = Sample(nothing, -1, -1) +const NOTHING_SAMPLE = Sample(nothing, UInt(0), Int64(-1), Int64(-1), Int64[]) Base.isnothing(s::Sample) = s.rate == -1 diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index ab3fd091..ea1945e5 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -319,10 +319,6 @@ function start_session(; nowait::Bool = true, email_when_ready::Union{Bool,Nothing} = nothing, for_running::Bool = false, - always_exact=nothing, - sample_rate=nothing, - max_num_bytes_exact=nothing, - force_new_sample_rate=nothing, kwargs..., )::SessionId # Should save 5ms of overhead @@ -336,12 +332,7 @@ function start_session(; # Configure configure(; kwargs...) - configure_sampling(; - always_exact=always_exact, - sample_rate=sample_rate, - max_num_bytes_exact=max_num_bytes_exact, - force_new_sample_rate=force_new_sample_rate - ) + configure_sampling(; kwargs...) current_session_id = _start_session( cluster_name, diff --git a/Banyan/src/utils_pfs.jl b/Banyan/src/utils_pfs.jl index ef82e907..9def168c 100644 --- a/Banyan/src/utils_pfs.jl +++ b/Banyan/src/utils_pfs.jl @@ -194,10 +194,10 @@ isoverlapping(a::AbstractRange, b::AbstractRange) = a.start ≤ b.stop && b.star @nospecialize -to_jl_value(jl) = Dict{String,Any}("is_banyan_value" => true, "contents" => to_jl_value_contents(jl)) +to_jl_value(jl) = Dict{String,Any}("is_banyan_value" => true, "contents" => to_jl_string(jl)) # NOTE: This function is shared between the client library and the PT library -function to_jl_value_contents(jl)::String +function to_jl_string(jl)::String # Handle functions defined in a module # TODO: Document this special case # if jl isa Function && !(isdefined(Base, jl) || isdefined(Core, jl) || isdefined(Main, jl)) @@ -211,7 +211,7 @@ function to_jl_value_contents(jl)::String end # NOTE: This function is shared between the client library and the PT library -function from_jl_value_contents(jl_value_contents::String) +function from_jl_string(jl_value_contents::String) # Converty string to Julia object io = IOBuffer() iob64_decode = Base64DecodePipe(io) diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 9da81827..36610a09 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -2,23 +2,31 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) -function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_invalid, invalidate_metadata, invalidate_sample, max_exact_sample_length)::Location - session_sample_rate = get_sample_rate() +function _remote_table_source(lp::LocationPath, loc::Location, 
sample_rate::Int64)::Location + # Setup for sampling + remotepath = lp.path + sampling_config = get_sampling_config(lp) + shuffled, max_num_bytes_exact = sampling_config.assume_shuffled, sampling_config.max_num_bytes_exact + # TODO: Replace `max_exact_sample_length` with `max_num_bytes_exact` is_main = is_main_worker() # Get cached Location and if it has valid parameters and sample, return - curr_location, curr_sample_invalid, curr_parameters_invalid = get_cached_location(remotepath, metadata_invalid, sample_invalid) - if !curr_parameters_invalid && !curr_sample_invalid - return curr_location + curr_metadata_invalid, curr_sample_invalid = loc.metadata_invalid, loc.sample_invalid + if !curr_metadata_invalid && !curr_sample_invalid + return loc end # There are two things we cache for each call `to _remote_table_source`: - # 1. A `Location` serialized to a `location_path` - # 2. Metadata stored in an Arrow file at `meta_path` + # 1. sample + # 2. metadata + + # Get paths for writing sample and metadata + metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" + sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" # Get metadata if it is still valid - curr_meta::Arrow.Table = if !curr_parameters_invalid - Arrow_Table_retry(curr_location.src_parameters["meta_path"]::String) + curr_meta::Arrow.Table = if !curr_metadata_invalid + Arrow_Table_retry(metadata_path) else Arrow.Table() end @@ -31,7 +39,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # Get list of local paths. Note that in the future when we support a list of # Internet locations, we will want to only call getpath laterin this code when/if # we actually read stuff in. - localpaths::Base.Vector{String}, remotepaths::Base.Vector{String} = if !curr_parameters_invalid + localpaths::Base.Vector{String}, remotepaths::Base.Vector{String} = if !curr_metadata_invalid remotepaths_res = convert(Base.Vector{String}, curr_meta[:path]) map(getpath, remotepaths_res), remotepaths_res else @@ -52,7 +60,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv String[localpath], String[remotepath] end end - curr_meta_nrows::Base.Vector{Int64} = !curr_parameters_invalid ? convert(Base.Vector{Int64}, curr_meta[:nrows]) : Int64[] + curr_meta_nrows::Base.Vector{Int64} = !curr_metadata_invalid ? 
convert(Base.Vector{Int64}, curr_meta[:nrows]) : Int64[] local_paths_on_curr_worker::Base.Vector{String} = split_across(localpaths) # Get format @@ -61,14 +69,14 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv format_has_separate_metadata = has_separate_metadata(format_value) # Get nrows, nbytes for each file in local_paths_on_curr_worker - meta_nrows_on_worker::Base.Vector{Int64} = if curr_parameters_invalid + meta_nrows_on_worker::Base.Vector{Int64} = if curr_metadata_invalid meta_nrows_on_worker_res = Base.zeros(length(local_paths_on_curr_worker)) - if format_has_separate_metadata - for (i, local_path_on_curr_worker) in enumerate(local_paths_on_curr_worker) - path_nrows_on_worker = get_metadata(format_value, local_path_on_curr_worker) - meta_nrows_on_worker_res[i] = path_nrows_on_worker - end - end + # if format_has_separate_metadata + # for (i, local_path_on_curr_worker) in enumerate(local_paths_on_curr_worker) + # path_nrows_on_worker = get_metadata(format_value, local_path_on_curr_worker) + # meta_nrows_on_worker_res[i] = path_nrows_on_worker + # end + # end # If this format doesn't have separate metadata, we will have to # read it in later along with the sample itself. meta_nrows_on_worker_res @@ -77,36 +85,39 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv end if Banyan.INVESTIGATING_COLLECTING_SAMPLES - println("In _remote_table_source on get_worker_idx()=$(get_worker_idx()) with curr_sample_invalid=$curr_sample_invalid, curr_parameters_invalid=$curr_parameters_invalid, localpaths=$localpaths, remotepaths=$remotepaths, local_paths_on_curr_worker=$local_paths_on_curr_worker, meta_nrows_on_worker=$meta_nrows_on_worker") + println("In _remote_table_source on get_worker_idx()=$(get_worker_idx()) with curr_sample_invalid=$curr_sample_invalid, curr_metadata_invalid=$curr_metadata_invalid, localpaths=$localpaths, remotepaths=$remotepaths, local_paths_on_curr_worker=$local_paths_on_curr_worker, meta_nrows_on_worker=$meta_nrows_on_worker") end # Compute the total # of rows so that if the current sample is invalid # we can determine whether to get an exact or inexact sample and # otherwise so that we can update the sample rate. - total_nrows_res = if curr_parameters_invalid - if format_has_separate_metadata - reduce_and_sync_across(+, sum(meta_nrows_on_worker)) - else + total_nrows_res = if curr_metadata_invalid + # if format_has_separate_metadata + # reduce_and_sync_across(+, sum(meta_nrows_on_worker)) + # else # For formats with metadata stored with the data (CSV), we # determine the # of rows later in the below case where # `!is_metadata_valid``. -1 - end + # end else - curr_location.src_parameters["nrows"] + parse(Int64, loc.src_parameters["nrows"]) end - exact_sample_needed = total_nrows_res < max_exact_sample_length + total_nbytes = curr_metadata_invalid ? -1 : parse(Int64, loc.src_parameters["total_memory_usage"]) + exact_sample_needed = sampling_config.always_exact || total_nbytes <= max_num_bytes_exact # inv: (a) `meta_nrows_on_worker`, (b) `total_nrows_res`, and # (c) `exact_sample_needed` are only valid if either the format has # separate metadata (like Parquet and Arrow) or the metadata is already # stored and valid. 
- is_metadata_valid = format_has_separate_metadata || !curr_parameters_invalid + # NOTE: Actually - we changed this because we no longer use + # is_metadata_valid = format_has_separate_metadata || !curr_metadata_invalid + is_metadata_valid = !curr_metadata_invalid # If the metadata isn't valid then we anyway have to read in all the data # so we can't leverage the data being shuffled by only reading in some of the files shuffled = shuffled && is_metadata_valid && !exact_sample_needed - # Get sample and also metadata if not yet valid at this point + # Get sample and also metadata if not yet valid!curr_metadata_invalid at this point recollected_sample_needed = curr_sample_invalid || !is_metadata_valid if Banyan.INVESTIGATING_COLLECTING_SAMPLES println("In _remote_table_source on get_worker_idx()=$(get_worker_idx()) with is_metadata_valid=$is_metadata_valid, shuffled = $shuffled, recollected_sample_needed=$recollected_sample_needed") @@ -126,7 +137,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv perm_for_shuffling = randperm(length(meta_nrows_on_worker)) shuffled_meta_nrows_on_worker = meta_nrows_on_worker[perm_for_shuffling] nrows_on_worker_so_far = 0 - nrows_on_worker_target = cld(sum(meta_nrows_on_worker), session_sample_rate) + nrows_on_worker_target = cld(sum(meta_nrows_on_worker), sample_rate) nfiles_on_worker_res = 0 for nrows_on_worker in shuffled_meta_nrows_on_worker nrows_on_worker_so_far += nrows_on_worker @@ -151,11 +162,11 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv let df = get_sample( format_value, local_path_on_curr_worker, - (shuffled || exact_sample_needed) ? 1.0 : session_sample_rate, + (shuffled || exact_sample_needed) ? 1.0 : sample_rate, meta_nrows_for_worker[i]::Int64 ) if Banyan.INVESTIGATING_COLLECTING_SAMPLES - println("Sampling on get_worker_idx()=$(get_worker_idx()) from local_path_on_curr_worker=$local_path_on_curr_worker with session_sample_rate=$session_sample_rate with meta_nrows_for_worker[i]=$(meta_nrows_for_worker[i]) and i=$i with nrow(df)=$(DataFrames.nrow(df)) and nrows_extra_on_worker=$nrows_extra_on_worker") + println("Sampling on get_worker_idx()=$(get_worker_idx()) from local_path_on_curr_worker=$local_path_on_curr_worker with sample_rate=$sample_rate with meta_nrows_for_worker[i]=$(meta_nrows_for_worker[i]) and i=$i with nrow(df)=$(DataFrames.nrow(df)) and nrows_extra_on_worker=$nrows_extra_on_worker") end if shuffled && i == nfiles_on_worker && nrows_extra_on_worker > 0 df[1:(end-nrows_extra_on_worker), :] @@ -175,24 +186,28 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # just have been read from the Arrow metadata file. local_nrows = 0 - for exact_sample_needed_res in [false, true] + for exact_sample_needed_res in (sampling_config.always_exact ? [true] : [false, true]) # First see if we can get a random (inexact sample). empty!(local_samples) local_nrows = 0 + local_nbytes = 0 for (i, local_path_on_curr_worker) in enumerate(local_paths_on_curr_worker) + path_sample_rate = exact_sample_needed_res ? 1.0 : sample_rate path_sample, path_nrows = get_sample_and_metadata( format_value, local_path_on_curr_worker, - exact_sample_needed_res ? 
1.0 : session_sample_rate + path_sample_rate ) meta_nrows_on_worker[i] = path_nrows push!(local_samples, path_sample) local_nrows += path_nrows + local_nbytes += ceil(Int64, total_memory_usage(path_sample) * path_sample_rate) end total_nrows_res = reduce_and_sync_across(+, local_nrows) + total_nbytes_res = reduce_and_sync_across(+, local_nbytes) # If the sample is too small, redo it, getting an exact sample - if !exact_sample_needed_res && total_nrows_res < max_exact_sample_length + if !exact_sample_needed_res && total_nbytes_res < max_exact_sample_length exact_sample_needed = true exact_sample_needed_res = true else @@ -207,7 +222,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv local_sample::DataFrames.DataFrame = isempty(local_samples) ? DataFrames.DataFrame() : vcat(local_samples...) # Concatenate local samples and nrows together - remote_sample_value::DataFrames.DataFrame, meta_nrows_on_workers::Base.Vector{Int64} = if curr_parameters_invalid + remote_sample_value::DataFrames.DataFrame, meta_nrows_on_workers::Base.Vector{Int64} = if curr_metadata_invalid sample_and_meta_nrows_per_worker::Base.Vector{Tuple{DataFrames.DataFrame,Base.Vector{Int64}}} = gather_across((local_sample, meta_nrows_on_worker)) if is_main @@ -239,7 +254,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # Return final Sample on main worker now that we have gathered both the sample and metadata if is_main - empty_sample_value_serialized::String = to_jl_value_contents(empty(remote_sample_value)) + empty_sample_value_serialized::String = to_arrow_string(empty(remote_sample_value)) # Convert dataframe to a buffer storing Arrow-serialized data. # Then when we receive this on the client side we can simply @@ -254,12 +269,12 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv total_nbytes_res = if exact_sample_needed remote_sample_value_memory_usage else - ceil(Int64, remote_sample_value_memory_usage * session_sample_rate) + ceil(Int64, remote_sample_value_memory_usage * sample_rate) end remote_sample_value_nrows = nrow(remote_sample_value) if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE @show total_nrows_res remote_sample_value_nrows - @show remote_sample_value_memory_usage total_nbytes_res session_sample_rate + @show remote_sample_value_memory_usage total_nbytes_res sample_rate end remote_sample_res::Sample = if exact_sample_needed # Technically we don't need to be passing in `total_bytes_res` @@ -269,11 +284,11 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # constructors ExactSample(remote_sample_value_arrow, total_nbytes_res) else - Sample(remote_sample_value_arrow, total_nbytes_res) + Sample(remote_sample_value_arrow, total_nbytes_res, sample_rate) end meta_nrows_on_workers, total_nrows_res, total_nbytes_res, remote_sample_res, empty_sample_value_serialized else - Base.zeros(length(localpaths)), -1, -1, NOTHING_SAMPLE, to_jl_value_contents(DataFrames.DataFrame()) + Base.zeros(length(localpaths)), -1, -1, NOTHING_SAMPLE, to_arrow_string(DataFrames.DataFrame()) end else # This case is entered if we the format has metadata stored @@ -287,38 +302,72 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv if is_main meta_nrows_res::Base.Vector{Int64} = vcat(meta_nrows_per_worker...) 
- # Get the total # of bytes - cached_remote_sample_res::Sample = curr_location.sample - remote_sample_value_nrows = nrow(cached_remote_sample_res.value) - remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_res.value) - if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE - @show remote_sample_value_nbytes remote_sample_value_nrows total_nrows_res - end + # # Get the total # of bytes + # cached_remote_sample_res = Sample( + # DataFrames.DataFrame(Arrow.Table("s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)")), + # sample_rate + # ) + # remote_sample_value_nrows = nrow(cached_remote_sample_res.value) + # remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_res.value) + # if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE + # @show remote_sample_value_nbytes remote_sample_value_nrows total_nrows_res + # end + # total_nbytes_res = ceil(Int64, remote_sample_value_nbytes * total_nrows_res / remote_sample_value_nrows) + + # # Update the sample's sample rate and memory usage based on the + # # new # of rows (since the metadata with info about # of rows + # # has been invalidated) + # cached_remote_sample_res.rate = ceil(Int64, total_nrows_res / remote_sample_value_nrows) + # cached_remote_sample_res.memory_usage = ceil(Int64, total_nbytes_res / cached_remote_sample_res.rate)::Int64 + # if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE + # @show sample_rate total_nbytes_res cached_remote_sample_res.memory_usage + # end + + cached_remote_sample_value = DataFrames.DataFrame(Arrow.Table(sample_path)) + remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_value) + remote_sample_value_nrows = DataFrames.nrow(cached_remote_sample_value) total_nbytes_res = ceil(Int64, remote_sample_value_nbytes * total_nrows_res / remote_sample_value_nrows) + cached_remote_sample_res = NOTHING_SAMPLE - # Update the sample's sample rate and memory usage based on the - # new # of rows (since the metadata with info about # of rows - # has been invalidated) - cached_remote_sample_res.rate = ceil(Int64, total_nrows_res / remote_sample_value_nrows) - cached_remote_sample_res.memory_usage = ceil(Int64, total_nbytes_res / cached_remote_sample_res.rate)::Int64 - if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE - @show cached_remote_sample_res.rate total_nbytes_res cached_remote_sample_res.memory_usage - end - - meta_nrows_res, total_nrows_res, total_nbytes_res, cached_remote_sample_res, curr_location.src_parameters["empty_sample"] + meta_nrows_res, total_nrows_res, total_nbytes_res, cached_remote_sample_res, loc.src_parameters["empty_sample"] else - Base.zeros(length(localpaths)), -1, -1, NOTHING_SAMPLE, to_jl_value_contents(DataFrames.DataFrame()) + Base.zeros(length(localpaths)), -1, -1, NOTHING_SAMPLE, to_arrow_string(DataFrames.DataFrame()) end end # If a file does not exist, one of the get_metadata/get_sample functions # will error. - # Write the metadata to an Arrow file - meta_path = is_main ? 
get_meta_path(remotepath) : "" - if curr_parameters_invalid + # Get source parameters + src_params = + Dict( + "name" => "Remote", + "total_memory_usage" => string(total_nbytes), + # For dispatching the appropriate PF for this format + "format" => format_string, + # For constructing the `BanyanDataFrames.DataFrame`'s `nrows::Future` field + "nrows" => string(total_nrows), + # For diagnostics purposes in PFs (partitioning functions) + "path" => remotepath, + # For PFs to read from this source + # TODO + "empty_sample" => empty_sample + ) + + # Write the metadata to S3 cache if previously invalid + if curr_metadata_invalid # Write `NamedTuple` with metadata to `meta_path` with `Arrow.write` - Arrow.write(is_main ? meta_path : IOBuffer(), (path=remotepaths, nrows=meta_nrows), compress=:zstd) + Arrow.write( + is_main ? metadata_path : IOBuffer(), + (path=remotepaths, nrows=meta_nrows); + compress=:zstd, + metadata=src_params + ) + end + + # Write the sample to S3 cache if previously invalid + if curr_sample_invalid + write(sample_path, remote_sample.value.data) end if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND @@ -327,54 +376,73 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # println("At end of _remote_table_source on get_worker_idx()=$(MPI.Initialized() ? get_worker_idx() : -1)") - # Return LocationSource + # Return LocationSource to client side if is_main # Construct the `Location` to return if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE @show total_nbytes end - location_res = LocationSource( + LocationSource( "Remote", - Dict( - # For dispatching the appropriate PF for this format - "format" => format_string, - # For constructing the `BanyanDataFrames.DataFrame`'s `nrows::Future` field - "nrows" => total_nrows, - # For diagnostics purposes in PFs (partitioning functions) - "path" => remotepath, - # For location constructor to use as caching - "meta_path" => meta_path, - # For PFs to read from this source - "empty_sample" => empty_sample - ), + src_params, total_nbytes, remote_sample ) - - # Write out the updated `Location` - cache_location(remotepath, location_res, invalidate_sample, invalidate_metadata) - - location_res else NOTHING_LOCATION end end -RemoteTableSource(remotepath; shuffled=true, metadata_invalid = false, sample_invalid = false, invalidate_metadata = false, invalidate_sample = false, max_exact_sample_length = Banyan.get_max_exact_sample_length())::Location = - let loc = offloaded( - _remote_table_source, - remotepath, - shuffled, - metadata_invalid, - sample_invalid, - invalidate_metadata, - invalidate_sample, - max_exact_sample_length; - distributed=true - ) - loc.sample.value = loc.sample.value |> seekstart |> Arrow.Table |> DataFrames.DataFrame +load_arrow_sample(f) = f |> Arrow.Table |> DataFrames.DataFrame +load_arrow_sample_from_buf(iobuf) = iobuf |> seekstart |> load_arrow_sample + +# TODO: Modify offloaded function to: +# - Use get_sampling_config() to get sample rate, shuffled, max_num_bytes_exact +# - Use the passed in location to get info about validity of metdata and samples +# - Use the passed in location to avoid reading from S3 to get the location +# - Use the LocationPath to get_sample_rate properly here and elsewhere +# - Write sample file and metadata file to S3 if needed +# - Parse string values of location metadata +# - Keep empty_sample but make it be a string of Arrow data with a to/from_arrow_value +# - Return location with sample and metadata + +function 
RemoteTableSource(remotepath)::Location + lp = LocationPath(remotepath, "arrow", "2") + + # Look at local and S3 caches of metadata and samples to attempt to + # construct a Location. + loc, local_metadata_path, local_sample_path = get_location_source(lp) + + if !loc.metadata_invalid && !loc.sample_invalid + # Case where both sample and parameters are valid + loc.sample.value = load_arrow_sample(local_sample_path) loc + elseif loc.metadata_invalid && !loc.sample_invalid + # Case where parameters are invalid + new_loc = offloaded(_remote_table_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + new_loc.sample.value = load_arrow_sample(local_sample_path) + new_loc + else + # Case where sample is invalid + + # Get the Location with up-to-date metadata (source parameters) and sample + new_loc = offloaded(_remote_table_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + + if !loc.metadata_invalid + # Store the metadata locally. The local copy just has the source + # parameters but PFs can still access the S3 copy which will have the + # table of file names and #s of rows. + Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + end + + # Store the Arrow sample locally and update the returned Sample + write(local_sample_path, new_loc.sample.value.data) + new_loc.sample.value = load_arrow_sample_from_buf(new_loc.sample.value) + + new_loc end +end # Load metadata for writing # NOTE: `remotepath` should end with `.parquet` or `.csv` if Parquet diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index 98027383..0c877289 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -196,13 +196,14 @@ function ReadBlockHelper(@nospecialize(format_value)) end loc_params_path = loc_params[symbol_path]::String + lp = LocationPath(loc_params_path, "arrow", "2") balanced = params[symbol_balanced] - m_path = loc_name == symbol_Disk ? sync_across(is_main_worker(comm) ? get_meta_path(loc_params_path) : "", comm=comm) : loc_params["meta_path"]::String - loc_params = loc_name == symbol_Disk ? (Banyan.deserialize_retry(get_location_path(loc_params_path))::Location).src_parameters : loc_params + m_path = "s3/$(banyan_metadata_bucket_name())/$(Banyan.get_metadata_path(lp))" + loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND @show (m_path, loc_params, get_worker_idx()) end - meta = Arrow_Table_retry(m_path) + # meta = Arrow_Table_retry(m_path) filtering_op = get(params, symbol_filtering_op, identity) # Handle multi-file tabular datasets @@ -216,8 +217,8 @@ function ReadBlockHelper(@nospecialize(format_value)) # [1] https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing # Initialize - meta_nrows = meta.nrows - meta_path = meta.path + meta_nrows = loc_params["nrows"] + meta_path = loc_params["path"] nworkers = get_nworkers(comm) npartitions = nbatches * nworkers partition_idx = get_partition_idx(batch_idx, nbatches, comm) @@ -359,7 +360,7 @@ function ReadBlockHelper(@nospecialize(format_value)) res = if isempty(dfs) # When we construct the location, we store an empty data frame with The # correct schema. 
- from_jl_value_contents(loc_params["empty_sample"]) + from_arrow_string(loc_params["empty_sample"]) elseif length(dfs) == 1 dfs[1] else @@ -398,6 +399,7 @@ function WriteHelper(@nospecialize(format_value)) # Get path of directory to write to is_disk = loc_name == "Disk" loc_params_path = loc_params["path"]::String + lp = LocationPath(loc_params_path, "arrow", "2") path::String = loc_params_path if startswith(path, "http://") || startswith(path, "https://") error("Writing to http(s):// is not supported") @@ -483,9 +485,13 @@ function WriteHelper(@nospecialize(format_value)) # Get paths for reading in metadata and Location tmp_suffix = nbatches > 1 ? ".tmp" : "" - m_path = is_main ? get_meta_path(loc_params_path * tmp_suffix) : "" - location_path = is_main ? get_location_path(loc_params_path * tmp_suffix) : "" - m_path, location_path = sync_across((m_path, location_path), comm=comm) + lp_tmp = LocationPath(loc_params_path * tmp_suffix, "arrow", "2") + # m_path = is_main ? get_meta_path() : "" + # location_path = is_main ? get_location_path(loc_params_path * tmp_suffix) : "" + # m_path, location_path = sync_across((m_path, location_path), comm=comm) + m_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp_tmp))" + s_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp_tmp))$sample_rate" + # loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params # Read in meta path if it's there curr_remotepaths, curr_nrows = if nbatches > 1 && batch_idx > 1 @@ -498,31 +504,34 @@ function WriteHelper(@nospecialize(format_value)) # Read in the current location if it's there empty_df = DataFrames.DataFrame() - curr_location::Location = if nbatches > 1 && batch_idx > 1 - Banyan.deserialize_retry(location_path) + curr_metadata_tbl = if nbatches > 1 && batch_idx > 1 + Arrow.Table(m_path) else - LocationSource( - "Remote", - Dict( - "format" => format_string, - "nrows" => 0, - "path" => loc_params_path, - "meta_path" => m_path, - "empty_sample" => to_jl_value_contents(empty_df) - ), - 0, - ExactSample(empty_df, 0) + Arrow.Table() + end + curr_src_parameters = if nbatches > 1 && batch_idx > 1 + Dict{String,String}(Arrow.getmetadata(curr_metadata_tbl)) + else + Dict( + "name" => "Remote", + "total_memory_usage" => "0", + "format" => format_string, + "nrows" => "0", + "path" => loc_params_path, + "empty_sample" => to_arrow_string(empty_df), ) end # Gather # of rows, # of bytes, empty sample, and actual sample nbytes = part_res isa Empty ? 0 : Banyan.total_memory_usage(part_res) - sample_rate = get_sample_rate() + sampling_config = get_sampling_config(lp) + sample_rate = sampling_config.rate sampled_part = (part_res isa Empty || is_disk) ? empty_df : Banyan.get_sample_from_data(part_res, sample_rate, nrows) gathered_data = gather_across((nrows, nbytes, part_res isa Empty ? part_res : empty(part_res), sampled_part), comm) # On the main worker, finalize metadata and location info. 
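# NOTE: A minimal sketch (not part of this patch) of the parse/string round-trip used below; this patch keeps source-parameter values as strings so they can be written as Arrow file metadata, so counters like "nrows" and "total_memory_usage" are parsed, updated, and re-stringified. The helper name is hypothetical.
function bump_string_counter!(src_params::Dict{String,String}, key::String, delta::Int64)
    # Parse the stored string, add the delta, and store it back as a string.
    src_params[key] = string(parse(Int64, src_params[key]) + delta)
    src_params
end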
+ sample_invalid = false if is_main # Determine paths and #s of rows for metadata file for worker_i in 1:nworkers @@ -538,48 +547,43 @@ function WriteHelper(@nospecialize(format_value)) end # Update the # of bytes - total_nrows::Int64 = curr_location.src_parameters["nrows"] + total_nrows::Int64 = parse(Int64, curr_src_parameters["nrows"]) + total_memory_usage::Int64 = parse(Int64, curr_src_parameters["total_memory_usage"]) empty_sample_found = false for (new_nrows::Int64, new_nbytes::Int64, empty_part, sampled_part) in gathered_data # Update the total # of rows and the total # of bytes total_nrows += sum(new_nrows) push!(curr_nrows, new_nrows) - curr_location.total_memory_usage += new_nbytes + total_memory_usage += new_nbytes # Get the empty sample if !empty_sample_found && !(empty_part isa Empty) - curr_location.src_parameters["empty_sample"] = to_jl_value_contents(empty_part) + curr_src_parameters["empty_sample"] = to_arrow_string(empty_part) empty_sample_found = true end end - curr_location.src_parameters["nrows"] = total_nrows + curr_src_parameters["nrows"] = string(total_nrows) + curr_src_parameters["total_memory_usage"] = string(total_memory_usage) + + if !is_disk && batch_idx == nbatches && total_memory_usage <= sampling_config.max_num_bytes_exact + # If the total # of rows turns out to be inexact then we can simply mark it as + # stale so that it can be collected more efficiently later on + # We should be able to quickly recompute a more useful sample later + # on when we need to use this location. + sample_invalid = true + end # Get the actual sample by concatenating - curr_location.sample = if is_disk - Sample() - else + if !is_disk && !sample_invalid sampled_parts = [gathered[4] for gathered in gathered_data] if batch_idx > 1 push!(sampled_parts, curr_location.sample.value |> seekstart |> Arrow.Table |> DataFrames.DataFrame) end - new_sample_value_arrow = IOBuffer() - Arrow.write(new_sample_value_arrow, vcat(sampled_parts...), compress=:zstd) - Sample(new_sample_value_arrow, curr_location.total_memory_usage) + Arrow.write(s_path, vcat(sampled_parts...), compress=:zstd) end # Determine paths for this batch and gather # of rows - Arrow.write(m_path, (path=curr_remotepaths, nrows=curr_nrows), compress=:zstd) - - if !is_disk && batch_idx == nbatches && total_nrows <= get_max_exact_sample_length() - # If the total # of rows turns out to be inexact then we can simply mark it as - # stale so that it can be collected more efficiently later on - # We should be able to quickly recompute a more useful sample later - # on when we need to use this location. 
- curr_location.sample_invalid = true - end - - # Write out the updated `Location` - serialize(location_path, curr_location) + Arrow.write(m_path, (path=curr_remotepaths, nrows=curr_nrows); compress=:zstd, metadata=curr_src_parameters) end ################################### @@ -588,11 +592,11 @@ function WriteHelper(@nospecialize(format_value)) if nbatches > 1 && batch_idx == nbatches # Copy over location and meta path - actual_meta_path = get_meta_path(loc_params_path) - actual_location_path = get_location_path(loc_params_path) + actual_meta_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" + actual_sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))$sample_rate" if worker_idx == 1 cp(m_path, actual_meta_path, force=true) - cp(location_path, actual_location_path, force=true) + cp(s_path, actual_sample_path, force=true) end # Copy over files to actual location diff --git a/BanyanDataFrames/src/utils_pfs.jl b/BanyanDataFrames/src/utils_pfs.jl index 5ce0112d..1839e606 100644 --- a/BanyanDataFrames/src/utils_pfs.jl +++ b/BanyanDataFrames/src/utils_pfs.jl @@ -1,3 +1,12 @@ +function to_arrow_string(df::DataFrames.DataFrame)::String + io = IOBuffer() + Arrow.write(io, df) + base64encode(seekstart(io)) +end + +from_arrow_string(s::String)::DataFrames.DataFrame = + s |> base64decode |> Arrow.Table |> DataFrames.DataFrame + const AnyDataFrame = Union{ DataFrames.DataFrame, SubDataFrame{DataFrames.DataFrame, DataFrames.Index, Base.Vector{Int64}}, diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index dff0e175..1634501e 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -35,12 +35,17 @@ # Construct location if reusing != "nothing" RemoteTableSource(src_name, invalidate_metadata = true, invalidate_sample = true) + invalidate_location(src_name) RemoteTableSource(src_name, metadata_invalid = true, sample_invalid = true) end + if (reusing == "nothing" || reusing == "sample") + invalidate_metadata(src_name) + end + if (reusing == "nothing" || reusing == "location") + invalidate_sample(src_name) + end remote_source = RemoteTableSource( src_name, - metadata_invalid = (reusing == "nothing" || reusing == "sample"), - sample_invalid = (reusing == "nothing" || reusing == "location"), shuffled = with_or_without_shuffled == "with", max_exact_sample_length = max_exact_sample_length ) @@ -48,7 +53,7 @@ # Verify the location @test remote_source.total_memory_usage > 0 - @test !remote_source.parameters_invalid + @test !remote_source.metadata_invalid @test !remote_source.sample_invalid @test remote_source.src_parameters["nrows"] == src_nrows # if contains(src_name, "dir") diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 537bf573..75f7d5e9 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -33,8 +33,8 @@ function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sampl is_main = worker_idx == 1 # Get current location - curr_location, curr_sample_invalid, curr_parameters_invalid = get_cached_location(path_and_subpath, metadata_invalid, sample_invalid) - if !curr_parameters_invalid && !curr_sample_invalid + curr_location, curr_sample_invalid, curr_metadata_invalid = get_cached_location(path_and_subpath, metadata_invalid, sample_invalid) + if !curr_metadata_invalid && !curr_sample_invalid return curr_location end diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 
eb244b33..f1ab4144 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -173,7 +173,7 @@ # # Serialize generator # if isnothing(remote_source) -# files = remotepath isa Tuple ? Banyan.to_jl_value_contents(remotepath) : files_to_read_from +# files = remotepath isa Tuple ? Banyan.to_jl_string(remotepath) : files_to_read_from # end # empty_part_size = (0, (datasize[2:end])...) @@ -188,7 +188,7 @@ # "ndims" => ndims, # "size" => datasize, # "eltype" => dataeltype, -# "emptysample" => to_jl_value_contents(Base.Array{dataeltype}(undef, empty_part_size)), +# "emptysample" => to_jl_string(Base.Array{dataeltype}(undef, empty_part_size)), # "format" => format, # "add_channelview" => add_channelview # ), @@ -287,8 +287,8 @@ function _remote_image_source( is_main = worker_idx == 1 # Get current location - curr_location, curr_sample_invalid, curr_parameters_invalid = get_cached_location((remotepath, add_channelview), remotepath_id, metadata_invalid, sample_invalid) - if !curr_parameters_invalid && !curr_sample_invalid + curr_location, curr_sample_invalid, curr_metadata_invalid = get_cached_location((remotepath, add_channelview), remotepath_id, metadata_invalid, sample_invalid) + if !curr_metadata_invalid && !curr_sample_invalid return curr_location end @@ -302,12 +302,12 @@ function _remote_image_source( # other is each iterated element and return a single path # Iterable object that iterates over local paths - meta_path = if !curr_parameters_invalid + meta_path = if !curr_metadata_invalid curr_location.src_parameters["meta_path"]::String else is_main ? get_meta_path((remotepath, add_channelview), remotepath_id) : "" end - if is_main && curr_parameters_invalid + if is_main && curr_metadata_invalid localpaths::Base.Vector{String} = getpaths(remotepath) Arrow.write(meta_path, (path=localpaths,)) end @@ -361,7 +361,7 @@ function _remote_image_source( # Construct location with metadata location_res = LocationSource( "Remote", - if curr_parameters_invalid + if curr_metadata_invalid empty_part_size = (0, (datasize_res[2:end])...) Dict{String,Any}( "meta_path" => meta_path, @@ -370,7 +370,7 @@ function _remote_image_source( "ndims" => ndims_res, "size" => datasize_res, "eltype" => dataeltype_res, - "empty_sample" => to_jl_value_contents(Base.Array{dataeltype_res}(undef, empty_part_size)), + "empty_sample" => to_arrow_string(Base.Array{dataeltype_res}(undef, empty_part_size)), "add_channelview" => add_channelview, "format" => "image" ) diff --git a/BanyanImages/src/pfs.jl b/BanyanImages/src/pfs.jl index 97d7afe0..c8edf0e8 100644 --- a/BanyanImages/src/pfs.jl +++ b/BanyanImages/src/pfs.jl @@ -66,7 +66,7 @@ ReadBlockImage( loc_params["meta_path"]::String, loc_params["nimages"]::Int64, loc_params["size"], - Banyan.from_jl_value_contents(loc_params["empty_sample"]::String), + Banyan.from_jl_string(loc_params["empty_sample"]::String), loc_params["add_channelview"] ) diff --git a/BanyanImages/test/pfs.jl b/BanyanImages/test/pfs.jl index ecaa05e8..c53994e6 100644 --- a/BanyanImages/test/pfs.jl +++ b/BanyanImages/test/pfs.jl @@ -33,7 +33,7 @@ # datasize = add_channelview ? (nimages, 3, 100, 100) : (nimages, 100, 100) # empty_part_size = add_channelview ? (0, 3, 100, 100) : (0, 100, 100) # elseif format == "generator" -# files = Banyan.to_jl_value_contents(path) +# files = Banyan.to_jl_string(path) # datasize = add_channelview ? (nimages, 3, 512, 512) : (nimages, 512, 512) # empty_part_size = add_channelview ? 
(0, 3, 512, 512) : (0, 512, 512) # elseif format == "path" @@ -66,7 +66,7 @@ # "ndims" => 3, # "size" => datasize, # Inaccurate value # "eltype" => dataeltype, -# "empty_sample" => Banyan.to_jl_value_contents(Base.Array{dataeltype}(undef, empty_part_size)), +# "empty_sample" => Banyan.to_jl_string(Base.Array{dataeltype}(undef, empty_part_size)), # "format" => filetype, # "add_channelview" => add_channelview # ), From d5c11b38bc29ccbe196fbabf0ad90e7504650854 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Sun, 7 Aug 2022 14:49:41 -0700 Subject: [PATCH 05/25] Refactor RemoteTableSource into RemoteSource --- Banyan/src/Banyan.jl | 2 +- Banyan/src/locations.jl | 46 ++++++++++++++++ BanyanDataFrames/src/locations.jl | 89 +++++++++++-------------------- BanyanDataFrames/src/pfs.jl | 11 ++-- 4 files changed, 83 insertions(+), 65 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index f51bad16..2550c4db 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -92,7 +92,7 @@ export SamplingConfig # Locations export Location, LocationSource, LocationDestination, located, sourced, destined -export Value, Size, Client, Disk, None +export Value, Size, Client, Disk, None, RemoteSource export invalidate_all_locations, invalidate_metadata, invalidate_sample export NOTHING_LOCATION, INVALID_LOCATION export has_separate_metadata, get_sample, get_metadata, get_sample_and_metadata diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index f1564ed5..670f1d5c 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -427,4 +427,50 @@ end function get_sample_and_metadata(::Val{:jl}, p, sample_rate) data = deserialize_retry(p) get_sample_from_data(data, sample_rate, size(data, 1)), size(data, 1) +end + +function RemoteSource( + lp::LocationPath, + _remote_source::Function, + load_sample::Function, + load_sample_from_blob::Function, + write_sample::Function +)::Location + # _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int64)::Location + # load_sample accepts a file path + # load_sample_from_blob accepts an array of bytes + + # Look at local and S3 caches of metadata and samples to attempt to + # construct a Location. + loc, local_metadata_path, local_sample_path = get_location_source(lp) + + if !loc.metadata_invalid && !loc.sample_invalid + # Case where both sample and parameters are valid + loc.sample.value = load_sample(local_sample_path) + loc + elseif loc.metadata_invalid && !loc.sample_invalid + # Case where parameters are invalid + new_loc = offloaded(_remote_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + new_loc.sample.value = load_sample(local_sample_path) + new_loc + else + # Case where sample is invalid + + # Get the Location with up-to-date metadata (source parameters) and sample + new_loc = offloaded(_remote_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + + if !loc.metadata_invalid + # Store the metadata locally. The local copy just has the source + # parameters but PFs can still access the S3 copy which will have the + # table of file names and #s of rows. 
+ Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + end + + # Store the Arrow sample locally and update the returned Sample + write_sample(local_sample_path, new_loc.sample.value) + new_loc.sample.value = load_sample_from_blob(new_loc.sample.value) + + new_loc + end end \ No newline at end of file diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 36610a09..e29b73ee 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -262,7 +262,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int6 # latency for retrieving metadata/samples for BDF.jl. io = IOBuffer() Arrow.write(io, remote_sample_value, compress=:zstd) - remote_sample_value_arrow = io + remote_sample_value_arrow = io.data # Construct Sample with the concatenated value, memory usage, and sample rate remote_sample_value_memory_usage = total_memory_usage(remote_sample_value) @@ -354,30 +354,31 @@ function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int6 "empty_sample" => empty_sample ) - # Write the metadata to S3 cache if previously invalid - if curr_metadata_invalid - # Write `NamedTuple` with metadata to `meta_path` with `Arrow.write` - Arrow.write( - is_main ? metadata_path : IOBuffer(), - (path=remotepaths, nrows=meta_nrows); - compress=:zstd, - metadata=src_params - ) - end + if is_main + # Write the metadata to S3 cache if previously invalid + if curr_metadata_invalid + # Write `NamedTuple` with metadata to `meta_path` with `Arrow.write` + Arrow.write( + is_main ? metadata_path : IOBuffer(), + (path=remotepaths, nrows=meta_nrows); + compress=:zstd, + metadata=src_params + ) + end - # Write the sample to S3 cache if previously invalid - if curr_sample_invalid - write(sample_path, remote_sample.value.data) - end + # Write the sample to S3 cache if previously invalid + if curr_sample_invalid + write(sample_path, remote_sample.value.data) + end - if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND - @show (remotepath, meta_path) - end + if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND + @show (remotepath, meta_path) + end - # println("At end of _remote_table_source on get_worker_idx()=$(MPI.Initialized() ? get_worker_idx() : -1)") + # println("At end of _remote_table_source on get_worker_idx()=$(MPI.Initialized() ? get_worker_idx() : -1)") + + # Return LocationSource to client specified - # Return LocationSource to client side - if is_main # Construct the `Location` to return if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE @show total_nbytes @@ -394,7 +395,6 @@ function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int6 end load_arrow_sample(f) = f |> Arrow.Table |> DataFrames.DataFrame -load_arrow_sample_from_buf(iobuf) = iobuf |> seekstart |> load_arrow_sample # TODO: Modify offloaded function to: # - Use get_sampling_config() to get sample rate, shuffled, max_num_bytes_exact @@ -406,43 +406,14 @@ load_arrow_sample_from_buf(iobuf) = iobuf |> seekstart |> load_arrow_sample # - Keep empty_sample but make it be a string of Arrow data with a to/from_arrow_value # - Return location with sample and metadata -function RemoteTableSource(remotepath)::Location - lp = LocationPath(remotepath, "arrow", "2") - - # Look at local and S3 caches of metadata and samples to attempt to - # construct a Location. 
- loc, local_metadata_path, local_sample_path = get_location_source(lp) - - if !loc.metadata_invalid && !loc.sample_invalid - # Case where both sample and parameters are valid - loc.sample.value = load_arrow_sample(local_sample_path) - loc - elseif loc.metadata_invalid && !loc.sample_invalid - # Case where parameters are invalid - new_loc = offloaded(_remote_table_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) - Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) - new_loc.sample.value = load_arrow_sample(local_sample_path) - new_loc - else - # Case where sample is invalid - - # Get the Location with up-to-date metadata (source parameters) and sample - new_loc = offloaded(_remote_table_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) - - if !loc.metadata_invalid - # Store the metadata locally. The local copy just has the source - # parameters but PFs can still access the S3 copy which will have the - # table of file names and #s of rows. - Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) - end - - # Store the Arrow sample locally and update the returned Sample - write(local_sample_path, new_loc.sample.value.data) - new_loc.sample.value = load_arrow_sample_from_buf(new_loc.sample.value) - - new_loc - end -end +RemoteTableSource(remotepath)::Location = + RemoteSource( + LocationPath(remotepath, "arrow", "2"), + _remote_table_source, + load_arrow_sample, + load_arrow_sample, + write + ) # Load metadata for writing # NOTE: `remotepath` should end with `.parquet` or `.csv` if Parquet diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index 0c877289..aa4f453c 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -199,7 +199,8 @@ function ReadBlockHelper(@nospecialize(format_value)) lp = LocationPath(loc_params_path, "arrow", "2") balanced = params[symbol_balanced] m_path = "s3/$(banyan_metadata_bucket_name())/$(Banyan.get_metadata_path(lp))" - loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params + m_tbl = Arrow_Table_retry(m_path) + loc_params = loc_name == symbol_Disk ? 
Dict{String,String}(Arrow.getmetadata(m_tbl)) : loc_params if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND @show (m_path, loc_params, get_worker_idx()) end @@ -217,12 +218,12 @@ function ReadBlockHelper(@nospecialize(format_value)) # [1] https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing # Initialize - meta_nrows = loc_params["nrows"] - meta_path = loc_params["path"] + meta_nrows = m_tbl.nrows + meta_path = m_tbl.path nworkers = get_nworkers(comm) npartitions = nbatches * nworkers partition_idx = get_partition_idx(batch_idx, nbatches, comm) - nrows::Int64 = loc_params[symbol_nrows]::Int64 + nrows::Int64 = meta_nrows rows_per_partition = cld(nrows, npartitions) sorting_perm = sortperm(meta_nrows, rev=true) files_by_partition = Base.Vector{Int64}[] @@ -312,7 +313,7 @@ function ReadBlockHelper(@nospecialize(format_value)) ndfs = 0 rowsscanned = 0 files_to_read = [] - for file in Tables.rows(meta) + for file in Tables.rows(m_tbl) path = file[1] path_nrows = file[2] newrowsscanned = rowsscanned + path_nrows From 347d1f8f8789ccd42db236a2a2adfe33aca09cad Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Sun, 7 Aug 2022 16:20:46 -0700 Subject: [PATCH 06/25] Modify BanyanHDF5.jl to use new sample caching system --- Banyan/src/location.jl | 2 + Banyan/src/locations.jl | 14 +++--- BanyanDataFrames/src/df.jl | 2 +- BanyanDataFrames/src/locations.jl | 7 ++- BanyanHDF5/src/hdf5.jl | 5 +-- BanyanHDF5/src/locations.jl | 72 +++++++++++++++--------------- BanyanONNXRunTime/src/locations.jl | 1 - 7 files changed, 54 insertions(+), 49 deletions(-) diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 93e41110..457ed3b1 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -62,6 +62,8 @@ struct LocationPath format_version ) end + + LocationPath(path) = LocationPath(path, "jl", get_julia_version())`` end global TABLE_FORMATS = ["csv", "parquet", "arrow"] diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 670f1d5c..8379a0e5 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -433,16 +433,20 @@ function RemoteSource( lp::LocationPath, _remote_source::Function, load_sample::Function, - load_sample_from_blob::Function, + load_sample_after_offloaded::Function, write_sample::Function )::Location # _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int64)::Location # load_sample accepts a file path - # load_sample_from_blob accepts an array of bytes + # load_sample_after_offloaded accepts the sampled value returned by the offloaded function + # (for BDF.jl, this is an Arrow blob of bytes that needs to be converted into an actual + # dataframe once sent to the client side) # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. 
loc, local_metadata_path, local_sample_path = get_location_source(lp) + sc = get_sampling_config(lp) + sc.rate = parse_sample_rate(local_sample_path) if !loc.metadata_invalid && !loc.sample_invalid # Case where both sample and parameters are valid @@ -450,7 +454,7 @@ function RemoteSource( loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid - new_loc = offloaded(_remote_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + new_loc = offloaded(_remote_source, lp, loc, sc; distributed=true) Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) new_loc.sample.value = load_sample(local_sample_path) new_loc @@ -458,7 +462,7 @@ function RemoteSource( # Case where sample is invalid # Get the Location with up-to-date metadata (source parameters) and sample - new_loc = offloaded(_remote_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + new_loc = offloaded(_remote_source, lp, loc, sc; distributed=true) if !loc.metadata_invalid # Store the metadata locally. The local copy just has the source @@ -469,7 +473,7 @@ function RemoteSource( # Store the Arrow sample locally and update the returned Sample write_sample(local_sample_path, new_loc.sample.value) - new_loc.sample.value = load_sample_from_blob(new_loc.sample.value) + new_loc.sample.value = load_sample_after_offloaded(new_loc.sample.value) new_loc end diff --git a/BanyanDataFrames/src/df.jl b/BanyanDataFrames/src/df.jl index 9ccede94..485a0d12 100644 --- a/BanyanDataFrames/src/df.jl +++ b/BanyanDataFrames/src/df.jl @@ -51,7 +51,7 @@ function read_table(path::String; kwargs...) @nospecialize df_loc = RemoteTableSource(path; kwargs...) df_loc.src_name == "Remote" || error("$path does not exist") - df_loc_nrows::Int64 = df_loc.src_parameters["nrows"] + df_loc_nrows::Int64 = parse(Int64, df_loc.src_parameters["nrows"]) df_nrows = Future(df_loc_nrows) DataFrame(Future(datatype="DataFrame", source=df_loc), df_nrows) end diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index e29b73ee..ab369514 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -2,10 +2,9 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) -function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int64)::Location +function _remote_table_source(lp::LocationPath, loc::Location, sampling_config::SamplingConfig)::Location # Setup for sampling remotepath = lp.path - sampling_config = get_sampling_config(lp) shuffled, max_num_bytes_exact = sampling_config.assume_shuffled, sampling_config.max_num_bytes_exact # TODO: Replace `max_exact_sample_length` with `max_num_bytes_exact` is_main = is_main_worker() @@ -359,7 +358,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int6 if curr_metadata_invalid # Write `NamedTuple` with metadata to `meta_path` with `Arrow.write` Arrow.write( - is_main ? 
metadata_path : IOBuffer(), + metadata_path, (path=remotepaths, nrows=meta_nrows); compress=:zstd, metadata=src_params @@ -423,7 +422,7 @@ RemoteTableDestination(remotepath)::Location = "Remote", Dict( "format" => get_file_ending(remotepath), - "nrows" => 0, + "nrows" => "0", "path" => remotepath, ), ) \ No newline at end of file diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index 30e20ad5..8f3d1979 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -2,9 +2,8 @@ function read_hdf5(path; kwargs...) A_loc = RemoteHDF5Source(path; kwargs...) A_loc.src_name == "Remote" || error("$path does not exist") A = Future(datatype="Array", source=A_loc) - A_loc_size = A_loc.src_parameters["size"] - A_loc_eltype = A_loc.src_parameters["eltype"] - A_loc_ndims = A_loc.src_parameters["ndims"] + A_loc_eltype, A_loc_size = Banyan.from_jl_string(A_loc.src_parameters["eltype_and_size"]) + A_loc_ndims = length(A_loc_size) BanyanArrays.Array{A_loc_eltype,A_loc_ndims}(A, Future(A_loc_size)) end diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 75f7d5e9..2ad32aa7 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -26,16 +26,20 @@ end HDF5_getindex_retry = retry(HDF5.getindex; delays=Base.ExponentialBackOff(; n=5)) -function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sample_invalid, invalidate_metadata, invalidate_sample, max_exact_sample_length) +function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig) + path_and_subpath = lp.path + shuffled = sc.assume_shuffled + curr_metadata_invalid = loc.metadata_invalid + curr_sample_invalid = loc.sample_invalid + # Get session information - session_sample_rate = get_sample_rate() + sample_rate = sc.rate worker_idx, nworkers = get_worker_idx(), get_nworkers() is_main = worker_idx == 1 # Get current location - curr_location, curr_sample_invalid, curr_metadata_invalid = get_cached_location(path_and_subpath, metadata_invalid, sample_invalid) if !curr_metadata_invalid && !curr_sample_invalid - return curr_location + return loc end # Download the path @@ -85,8 +89,8 @@ function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sampl # Read in the sample on each worker and # aggregate and concatenate it on the main worker rand_indices_range = split_len(datalength, worker_idx, nworkers) - rand_indices = sample_from_range(rand_indices_range, session_sample_rate) - exact_sample_needed = datalength < max_exact_sample_length + rand_indices = sample_from_range(rand_indices_range, sample_rate) + exact_sample_needed = nbytes < sc.max_num_bytes_exact remaining_colons = Base.fill(Colon(), datandims-1) dset_sample_value = if !exact_sample_needed samples_on_workers = gather_across( @@ -125,48 +129,47 @@ function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sampl NOTHING_SAMPLE end else - curr_location.sample + NOTHING_SAMPLE end # Close HDF5 file close(f) if is_main - location_res = LocationSource( - "Remote", - Dict{String,Any}( - "path_and_subpath" => path_and_subpath, - "path" => remotepath, - "subpath" => datasetpath, - "size" => datasize, - "ndims" => datandims, - "eltype" => dataeltype, - "nbytes" => nbytes, - "format" => "hdf5" - ), - nbytes, - dset_sample, + # Construct parameters for Location + src_params = Dict{String,String}( + "name" => "Remote", + "path_and_subpath" => path_and_subpath, + "path" => remotepath, + "subpath" => datasetpath, + "eltype_and_size" => Banyan.to_jl_string((dataeltype, datasize)), + 
"total_memory_usage" => string(nbytes), + "format" => "hdf5" ) - cache_location(remotepath, location_res, invalidate_sample, invalidate_metadata) - location_res + + # Get paths to store metadata and sample in + metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" + sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" + + # Store metadata and sample in S3 + Arrow.write(metadata_path; metadata=src_params) + serialize(sample_path, dset_sample) + + # Return Location to client side + LocationSource("Remote", src_params, nbytes, dset_sample) else INVALID_LOCATION end end -function RemoteHDF5Source(remotepath; shuffled=false, metadata_invalid = false, sample_invalid = false, invalidate_metadata = false, invalidate_sample = false, max_exact_sample_length = Banyan.get_max_exact_sample_length())::Location - offloaded( +RemoteHDF5Source(remotepath)::Location = + RemoteSource( + LocationPath(remotepath), _remote_hdf5_source, - remotepath, - shuffled, - metadata_invalid, - sample_invalid, - invalidate_metadata, - invalidate_sample, - max_exact_sample_length; - distributed=true + deserialize, + identity, + serialize ) -end function RemoteHDF5Destination(remotepath)::Location path_and_subpath = remotepath @@ -178,7 +181,6 @@ function RemoteHDF5Destination(remotepath)::Location "path" => remotepath, "subpath" => datasetpath, "path_and_subpath" => path_and_subpath, - "nbytes" => 0, "format" => "hdf5" ) ) diff --git a/BanyanONNXRunTime/src/locations.jl b/BanyanONNXRunTime/src/locations.jl index 613b7a66..812ad8f1 100644 --- a/BanyanONNXRunTime/src/locations.jl +++ b/BanyanONNXRunTime/src/locations.jl @@ -19,7 +19,6 @@ function RemoteONNXSource(remotepath)::Location loc_for_reading = "Remote" metadata_for_reading = Dict{String,Any}( "path" => remotepath, - "nbytes" => nbytes, "format" => "onnx", "datatype" => "ONNX" ) From ac9d1d15eb16d47e479ef0cc74d1904ca9605f82 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Sun, 7 Aug 2022 18:19:58 -0700 Subject: [PATCH 07/25] Modify BanyanImages.jl to use new sample caching system --- Banyan/src/locations.jl | 7 +- Banyan/src/utils.jl | 53 ++++++++++++- BanyanHDF5/src/hdf5.jl | 2 + BanyanHDF5/src/locations.jl | 27 ++++--- BanyanImages/src/image.jl | 11 +-- BanyanImages/src/locations.jl | 138 +++++++++++++++++----------------- BanyanImages/src/pfs.jl | 16 ++-- 7 files changed, 158 insertions(+), 96 deletions(-) diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 8379a0e5..8c4a0378 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -434,7 +434,8 @@ function RemoteSource( _remote_source::Function, load_sample::Function, load_sample_after_offloaded::Function, - write_sample::Function + write_sample::Function, + args... 
)::Location # _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int64)::Location # load_sample accepts a file path @@ -454,7 +455,7 @@ function RemoteSource( loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid - new_loc = offloaded(_remote_source, lp, loc, sc; distributed=true) + new_loc = offloaded(_remote_source, lp, loc, sc, args...; distributed=true) Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) new_loc.sample.value = load_sample(local_sample_path) new_loc @@ -462,7 +463,7 @@ function RemoteSource( # Case where sample is invalid # Get the Location with up-to-date metadata (source parameters) and sample - new_loc = offloaded(_remote_source, lp, loc, sc; distributed=true) + new_loc = offloaded(_remote_source, lp, loc, sc, args...; distributed=true) if !loc.metadata_invalid # Store the metadata locally. The local copy just has the source diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 3b02e192..6a609027 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -637,4 +637,55 @@ exponential_backoff_1s = # 0.20068919503553564 # 0.29422854986603664 # 0.4414150248213825 -# ``` \ No newline at end of file +# ```` + +invert(my_dict::AbstractDict) = Dict(value => key for (key, value) in my_dict) + +TYPE_TO_STR = + Dict{DataType,String}( + Int8 => "int8", + Int16 => "int16", + Int32 => "int32", + Int64 => "int64", + Int128 => "int128", + Float16 => "float16", + Float32 => "float32", + Float64 => "float64", + String => "str", + Bool => "bool", + ) + +STR_TO_TYPE = invert(TYPE_TO_STR) + +function type_to_str(ty::DataType)::String + global TYPE_TO_STR + if haskey(TYPE_TO_STR, ty) + TYPE_TO_STR[ty] + else + "lang_jl_" * to_jl_string(ty) + end +end + +function type_from_str(s::String) + if startswith(s, "lang_") + if startswith(s, "lang_jl_") + from_jl_string(s[4:end]) + else + error("Cannot parse type $s from non-Julia language") + end + elseif haskey(TYPE_TO_STR, s) + TYPE_TO_STR[s] + else + error("Type not supported. You may need to update to the latest version of Banyan or declare the data/sample/metadata you are accessing invalid.") + end +end + +size_to_str(sz) = join(map(string, sz), ",") +size_from_str(s) = + let sz_strs = split(s, ",") + res = Vector{Int64}(undef, length(sz_strs)) + for (i, sz_str) in enumerate(sz_strs) + res[i] = parse(Int64, sz_str) + end + Tuple(res) + end \ No newline at end of file diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index 8f3d1979..d7f8959a 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -3,6 +3,8 @@ function read_hdf5(path; kwargs...) 
A_loc.src_name == "Remote" || error("$path does not exist") A = Future(datatype="Array", source=A_loc) A_loc_eltype, A_loc_size = Banyan.from_jl_string(A_loc.src_parameters["eltype_and_size"]) + A_loc_eltype = Banyan.type_from_str(A_loc.src_parameters["eltype"]) + A_loc_size = Banyan.size_from_str(A_loc.src_parameters["size"]) A_loc_ndims = length(A_loc_size) BanyanArrays.Array{A_loc_eltype,A_loc_ndims}(A, Future(A_loc_size)) end diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 2ad32aa7..1dc78be0 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -123,7 +123,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig if exact_sample_needed ExactSample(dset_sample_value, nbytes) else - Sample(dset_sample_value, nbytes) + Sample(dset_sample_value, nbytes, sample_rate) end else NOTHING_SAMPLE @@ -137,22 +137,27 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig if is_main # Construct parameters for Location - src_params = Dict{String,String}( - "name" => "Remote", - "path_and_subpath" => path_and_subpath, - "path" => remotepath, - "subpath" => datasetpath, - "eltype_and_size" => Banyan.to_jl_string((dataeltype, datasize)), - "total_memory_usage" => string(nbytes), - "format" => "hdf5" - ) + src_params = if curr_metadata_invalid + Dict{String,String}( + "name" => "Remote", + "path_and_subpath" => path_and_subpath, + "path" => remotepath, + "subpath" => datasetpath, + "eltype" => Banyan.size_to_str(dataszie), + "size" => Banyan.type_to_str(dataeltype), + "total_memory_usage" => string(nbytes), + "format" => "hdf5" + ) + else + loc.src_parameters + end # Get paths to store metadata and sample in metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" # Store metadata and sample in S3 - Arrow.write(metadata_path; metadata=src_params) + Arrow.write(metadata_path, Arrow.Table(); metadata=src_params) serialize(sample_path, dset_sample) # Return Location to client side diff --git a/BanyanImages/src/image.jl b/BanyanImages/src/image.jl index 34f63b5c..77e4cd1a 100644 --- a/BanyanImages/src/image.jl +++ b/BanyanImages/src/image.jl @@ -1,10 +1,11 @@ -function read_png(path; kwargs...) - image_loc = RemoteImageSource(path; kwargs...) +function read_png(path; add_channelview=false) + image_loc = RemoteImageSource(path, add_channelview) image_loc.src_name == "Remote" || error("$path does not exist") image = Future(;source=image_loc, datatype="Array") - image_loc_eltype = image_loc.src_parameters["eltype"] - image_loc_ndims = image_loc.src_parameters["ndims"] - BanyanArrays.Array{image_loc_eltype,image_loc_ndims}(image, Future(image_loc.src_parameters["size"])) + image_loc_eltype = type_from_str(image_loc.src_parameters["eltype"]) + image_loc_size = size_from_str(image_loc.src_parameters["size"]) + image_loc_ndims = length(image_loc_size) + BanyanArrays.Array{image_loc_eltype,image_loc_ndims}(image, Future(image_loc_size)) end read_jpg(p; kwargs...) = read_png(p; kwargs...) 
diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index f1ab4144..4d084f2a 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -272,24 +272,17 @@ _load_image_and_add_channelview(path_on_worker::String) = load_retry(path_on_wor _reshape_image(image) = reshape(image, (1, size(image)...)) -function _remote_image_source( - remotepath, - remotepath_id, - metadata_invalid, - sample_invalid, - invalidate_metadata, - invalidate_sample, - add_channelview -) +function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfig, remotepath, add_channelview::Bool) + curr_sample_invalid = loc.sample_invalid + curr_metadata_invalid = loc.metadata_invalid + # Get session information - session_sample_rate = get_sample_rate() worker_idx, nworkers = get_worker_idx(), get_nworkers() is_main = worker_idx == 1 # Get current location - curr_location, curr_sample_invalid, curr_metadata_invalid = get_cached_location((remotepath, add_channelview), remotepath_id, metadata_invalid, sample_invalid) if !curr_metadata_invalid && !curr_sample_invalid - return curr_location + return loc end # Remote path is either @@ -301,28 +294,23 @@ function _remote_image_source( # that operates on two arguments where one is the object and the # other is each iterated element and return a single path - # Iterable object that iterates over local paths - meta_path = if !curr_metadata_invalid - curr_location.src_parameters["meta_path"]::String - else - is_main ? get_meta_path((remotepath, add_channelview), remotepath_id) : "" - end - if is_main && curr_metadata_invalid - localpaths::Base.Vector{String} = getpaths(remotepath) - Arrow.write(meta_path, (path=localpaths,)) - end - meta_path = sync_across(meta_path) + # Get paths to store metadata and sample in + metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" + sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$(sc.rate))" - # Load in the metadata and get the # of images - meta_table = Arrow_Table_retry(meta_path) - nimages = Tables.rowcount(meta_table) + # Iterable object that iterates over local paths + localpaths = curr_metadata_invalid ? getpaths(remotepath) : Arrow.Table(metadata_path).path + nimages = length(localpaths) # Read in images on each worker. We need to read in at least one image # regardless of whether we want to get the sample or the metadata - exact_sample_needed = nimages < 10 + _load_img = add_channelview ? _load_image_and_add_channelview : _load_image + first_img = is_main ? (localpaths[1] |> _load_img |> _reshape_image) : nothing + exact_sample_needed = is_main ? ((total_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false + exact_sample_needed = sync_across(exact_sample_needed) need_to_parallelize = nimages >= 10 total_num_images_to_read_in = if curr_sample_invalid - exact_sample_needed ? nimages : cld(nimages, session_sample_rate) + exact_sample_needed ? nimages : cld(nimages, sc.rate) else # We still have to read in an image even if we have a valid sample # because to get the metadata we need at least one image. @@ -332,9 +320,21 @@ function _remote_image_source( # If we don't need to paralellize then we are only reading on the main # worker amd we don't gather across. images_range_on_worker = need_to_parallelize ? split_len(total_num_images_to_read_in, worker_idx, nworkers) : 1:1 - paths_on_worker = map(getpath, meta_table.path[images_range_on_worker]) - images = map(add_channelview ? 
_load_image_and_add_channelview : _load_image, paths_on_worker) - sample_on_worker = map(_reshape_image, images) + first_img_usable = false + if images_range_on_worker.start == 1 && !isnothing(first_img) + first_img_usable = true + images_range_on_worker = 2:(images_range_on_worker.stop) + end + sample_on_worker = if length(images_range_on_worker) > 0 + paths_on_worker = map(getpath, localpaths[images_range_on_worker]) + images = map(_load_img, paths_on_worker) + map(_reshape_image, images) + else + [] + end + if first_img_usable + push!(sample_on_worker, first_img) + end # sample_on_worker is an array of images need_to_parallelize ? gather_across(sample_on_worker) : [sample_on_worker] # result is an array of arrays of images @@ -348,58 +348,62 @@ function _remote_image_source( # though if we only need the sample we don't technically need the # metadata) remote_sample_value = cat(vcat(samples_on_workers...)..., dims=1) - ndims_res = ndims(remote_sample_value) dataeltype_res = eltype(remote_sample_value) nbytes_res = cld(length(remote_sample_value) * sizeof(dataeltype_res) * nimages, total_num_images_to_read_in) datasize_res = indexapply(nimages, size(remote_sample_value), 1) remote_sample = if curr_sample_invalid - exact_sample_needed ? ExactSample(remote_sample_value, nbytes_res) : Sample(remote_sample_value, nbytes_res) + if exact_sample_needed + ExactSample(remote_sample_value, nbytes_res) + else + Sample(remote_sample_value, nbytes_res, sc.rate) + end else - curr_location.sample + NOTHING_SAMPLE + end + + src_parameters = if curr_metadata_invalid + Dict{String,Any}( + "name" => "Remote", + "nimages" => string(nimages), + "total_memory_usage" => string(nbytes_res), # NOTE: We assume all files have same size + "size" => size_to_str(datasize_res), + "eltype" => type_to_str(dataeltype_res), + "add_channelview" => add_channelview ? "1" : "0", + "format" => "image" + ) + else + curr_location.src_parameters + end + + # Store metadata and sample in S3 + if curr_metadata_invalid + Arrow.write(metadata_path, (path=localpaths,); metadata=src_params) + end + if curr_sample_invalid + serialize(sample_path, remote_sample) end # Construct location with metadata - location_res = LocationSource( - "Remote", - if curr_metadata_invalid - empty_part_size = (0, (datasize_res[2:end])...) - Dict{String,Any}( - "meta_path" => meta_path, - "nimages" => nimages, - "nbytes" => nbytes_res, # NOTE: We assume all files have same size - "ndims" => ndims_res, - "size" => datasize_res, - "eltype" => dataeltype_res, - "empty_sample" => to_arrow_string(Base.Array{dataeltype_res}(undef, empty_part_size)), - "add_channelview" => add_channelview, - "format" => "image" - ) - else - curr_location.src_parameters - end, - nbytes_res, - remote_sample, - ) - cache_location(remotepath, remotepath_id, location_res, invalidate_sample, invalidate_metadata) - location_res + LocationSource("Remote", src_parameters, nbytes_res, remote_sample) else INVALID_LOCATION end end -function RemoteImageSource(remotepath; metadata_invalid = false, sample_invalid = false, invalidate_metadata = false, invalidate_sample = false, add_channelview=false)::Location - offloaded( +RemoteImageSource(remotepath, add_channelview)::Location = + RemoteSource( + LocationPath( + remotepath isa String ? remotepath : "lang_jl_$(hash(remotepath))", + add_channelview ? 
"jl_channelview" : "jl", + Banyan.get_julia_version() + ), _remote_image_source, + deserialize, + identity, + serialize, remotepath, - Banyan.get_remotepath_id(remotepath), - metadata_invalid, - sample_invalid, - invalidate_metadata, - invalidate_sample, - add_channelview; - distributed=true + add_channelview ) -end # TODO: Implement writing diff --git a/BanyanImages/src/pfs.jl b/BanyanImages/src/pfs.jl index c8edf0e8..a01989d8 100644 --- a/BanyanImages/src/pfs.jl +++ b/BanyanImages/src/pfs.jl @@ -12,8 +12,7 @@ function ReadBlockImageHelper( meta_path::String, nimages::Int64, datasize, - empty_sample, - add_channelview::Bool + add_channelview::Int64 ) # path = Banyan.getpath(loc_params["path"]) ? isa(loc_params["path"], String) : path # ndims = loc_params["ndims"] @@ -31,13 +30,13 @@ function ReadBlockImageHelper( files_sub = meta_table.path[filerange] part_size = (length(files_sub), (datasize)[2:end]...) - empty_sample_eltype = eltype(empty_sample) - images = Base.Array{empty_sample_eltype}(undef, part_size) + elty = Banyan.type_from_str(loc_params["eltype"]) + images = Base.Array{elty}(undef, part_size) # TODO: Make it so that the Arrow file only contains the paths and the local paths are computed here for (i, f) in enumerate(files_sub) filepath = Banyan.getpath(f) image = load_retry(filepath) - if add_channelview + if add_channelview == 1 image = ImageCore.channelview(image) images[i, :, :, :] = image else @@ -64,10 +63,9 @@ ReadBlockImage( loc_name, loc_params, loc_params["meta_path"]::String, - loc_params["nimages"]::Int64, - loc_params["size"], - Banyan.from_jl_string(loc_params["empty_sample"]::String), - loc_params["add_channelview"] + parse(Int64, loc_params["nimages"]), + Banyan.size_from_str(loc_params["size"]), + parse(Int64, loc_params["add_channelview"]) ) # function WriteImage( From 08c5fbfa9b66c8c09a6efeeb48b8bd0e4e962204 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Mon, 8 Aug 2022 06:13:57 -0700 Subject: [PATCH 08/25] Implement invalidation functions, update exports, add kwargs to all read functions --- Banyan/src/Banyan.jl | 15 +- Banyan/src/location.jl | 10 +- Banyan/src/locations.jl | 181 +++++++++++---------- Banyan/src/sample.jl | 2 +- Banyan/src/samples.jl | 11 +- Banyan/test/runtests.jl | 6 +- BanyanArrays/test/runtests.jl | 4 - BanyanDataFrames/src/df.jl | 2 + BanyanDataFrames/src/locations.jl | 2 +- BanyanDataFrames/test/sample_collection.jl | 9 +- BanyanDataFrames/test/utils_sessions.jl | 4 - BanyanHDF5/src/hdf5.jl | 2 + BanyanHDF5/src/pfs.jl | 9 +- BanyanHDF5/test/runtests.jl | 4 - BanyanHDF5/test/sample_collection.jl | 17 +- BanyanImages/src/image.jl | 2 + BanyanImages/test/runtests.jl | 4 - BanyanONNXRunTime/test/runtests.jl | 4 - 18 files changed, 142 insertions(+), 146 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 2550c4db..ce14b371 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -93,17 +93,12 @@ export SamplingConfig # Locations export Location, LocationSource, LocationDestination, located, sourced, destined export Value, Size, Client, Disk, None, RemoteSource -export invalidate_all_locations, invalidate_metadata, invalidate_sample -export NOTHING_LOCATION, INVALID_LOCATION +export invalidate_all_locations, invalidate_location, invalidate_metadata, invalidate_samples, invalidate +export NOTHING_LOCATION, INVALID_LOCATION, NO_LOCATION_PATH export has_separate_metadata, get_sample, get_metadata, get_sample_and_metadata -export get_remotepath_id, - get_meta_path, - get_location_path, - get_cached_location, - 
cache_location, - get_max_exact_sample_length, - set_max_exact_sample_length -export LocationPath +export LocationPath, SamplingConfig +export has_metadata, has_sample, get_sample_rate, configure_sampling +export type_to_str, str_to_type # Serialization export from_jl_string, to_jl_string diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 457ed3b1..8f52660e 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -133,7 +133,7 @@ function get_sample_rate(l_path::LocationPath) # Find a cached sample with a similar sample rate pre = get_sample_path_prefix(l_path) banyan_samples_objects = try - res = S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=pre)["Contents"] + res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"] res isa Base.Vector ? res : [res] catch return desired_sample_rate @@ -155,7 +155,7 @@ end function has_metadata(l_path:: LocationPath)::Bool try - !isempty(S3.list_objects_v2(Bucket=banyan_metadata_bucket_name(), prefix=get_metadata_path(l_path))["Contents"]) + !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) catch false end @@ -165,7 +165,7 @@ function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) pre = sc.force_new_sample_rate ? get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) try - !isempty(S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=pre)["Contents"]) + !isempty(S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"]) catch false end @@ -199,7 +199,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} metadata_local_path = joinpath(homedir(), ".banyan", "metadata", metadata_path) metadata_s3_path = "/$(banyan_metadata_bucket_name())/$metadata_path" src_params_not_stored_locally = false - src_params::Dict{String, String} = if exists(metadata_local_path) + src_params::Dict{String, String} = if isfile(metadata_local_path) lm = Dates.unix2datetime(mtime(metadata_local_path)) if_modified_since_string = "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" @@ -300,7 +300,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # If no such sample is found, search the S3 bucket banyan_samples_objects = try - res = S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=sample_path_prefix)["Contents"] + res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => sample_path_prefix))["Contents"] res isa Base.Vector ? 
res : [res] catch e if is_debug_on() diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 8c4a0378..7c0a37a2 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -279,21 +279,10 @@ Disk()::Location = deepcopy(DISK) # - You might have lots of huge images # - You might have lots of workers so your sample rate is really large -MAX_EXACT_SAMPLE_LENGTH = parse(Int64, get(ENV, "BANYAN_MAX_EXACT_SAMPLE_LENGTH", "1024")::String) -get_max_exact_sample_length()::Int64 = MAX_EXACT_SAMPLE_LENGTH -function set_max_exact_sample_length(val) - global MAX_EXACT_SAMPLE_LENGTH - MAX_EXACT_SAMPLE_LENGTH = val -end - -getsamplenrows(totalnrows::Int64)::Int64 = - if totalnrows <= get_max_exact_sample_length() - # NOTE: This includes the case where the dataset is empty - # (totalnrows == 0) - totalnrows - else +getsamplenrows(totalnrows::Int64)::Int64 = begin + sc = get_sampling_config() # Must have at least 1 row - cld(totalnrows, get_sample_rate()) + cld(totalnrows, sc.always_exact ? 1 : sc.rate) end # We maintain a cache of locations and a cache of samples. Locations contain @@ -304,88 +293,114 @@ getsamplenrows(totalnrows::Int64)::Int64 = # Banyan is not aware of mutates the location. Locations should be # eventually stored and updated in S3 on each write. -_invalidate_all_locations() = begin - for dir_name in ["banyan_locations", "banyan_meta"] - rm("s3/$(get_cluster_s3_bucket_name())/$dir_name/", force=true, recursive=true) +function invalidate_metadata(p; kwargs...) + lp = get_location_path_with_format(p; kwargs...) + + # Delete locally + p = joinpath(homedir(), ".banyan", "metadata", get_metadata_path(lp)) + if isfile(p) + rm(p) end -end -_invalidate_metadata(remotepath) = - let p = get_location_path(remotepath) - if isfile(p) - loc = deserialize_retry(p) - loc.metadata_invalid = true - serialize(p, loc) + + # Delete from S3 + try + S3.delete_object(banyan_samples_bucket_name(), get_metadata_path(lp)) + catch e + if is_debug_on() + show(e) end end -_invalidate_sample(remotepath) = - let p = get_location_path(remotepath) - if isfile(p) - loc = deserialize_retry(p) - loc.sample_invalid = true - serialize(p, loc) +end +function invalidate_samples(p; kwargs...) + lp = get_location_path_with_format(p; kwargs...) 
+ + # Delete locally + samples_local_dir = joinpath(homedir(), ".banyan", "samples") + if isdir(samples_local_dir) + sample_path_prefix = get_sample_path_prefix(lp) + for local_sample_path in readdir(samples_local_dir, join=true) + if startswith(local_sample_path, sample_path_prefix) + rm(local_sample_path) + end end end -invalidate_all_locations() = offloaded(_invalidate_all_locations) -invalidate_metadata(p) = offloaded(_invalidate_metadata, p) -invalidate_sample(p) = offloaded(_invalidate_sample, p) - -@specialize -# Helper functions for location constructors; these should only be called from the main worker - -# TODO: Hash in a more general way so equivalent paths hash to same value -# This hashes such that an extra slash at the end won't make a difference`` -get_remotepath_id(remotepath::String) = - (get_julia_version(), (remotepath |> splitpath |> joinpath)) |> hash -get_remotepath_id(remotepath) = (get_julia_version(), remotepath) |> hash -function get_location_path(remotepath, remotepath_id) - session_s3_bucket_name = get_cluster_s3_bucket_name() - if !isdir("s3/$session_s3_bucket_name/banyan_locations/") - mkdir("s3/$session_s3_bucket_name/banyan_locations/") + # Delete from S3 + banyan_samples_objects = try + res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => sample_path_prefix))["Contents"] + res isa Base.Vector ? res : [res] + catch e + if is_debug_on() + show(e) + end + [] end - "s3/$session_s3_bucket_name/banyan_locations/$(remotepath_id)" -end -function get_meta_path(remotepath, remotepath_id) - session_s3_bucket_name = get_cluster_s3_bucket_name() - if !isdir("s3/$session_s3_bucket_name/banyan_meta/") - mkdir("s3/$session_s3_bucket_name/banyan_meta/") + if !isempty(banyan_samples_objects) + objects_to_delete = [] + for d in banyan_samples_objects + push!(objects_to_delete, Dict("Key" => d["Key"])) + end + S3.delete_objects( + banyan_samples_bucket_name(), + Dict("Objects" => objects_to_delete) + ) end - "s3/$session_s3_bucket_name/banyan_meta/$remotepath_id" end -get_location_path(remotepath) = - get_location_path(remotepath, get_remotepath_id(remotepath)) -get_meta_path(remotepath) = - get_meta_path(remotepath, get_remotepath_id(remotepath)) - -function get_cached_location(remotepath, remotepath_id, metadata_invalid, sample_invalid) - Random.seed!(hash((get_session_id(), remotepath_id))) - session_s3_bucket_name = get_cluster_s3_bucket_name() - location_path = "s3/$session_s3_bucket_name/banyan_locations/$remotepath_id" - - curr_location::Location = try - deserialize_retry(location_path) - catch - INVALID_LOCATION - end - curr_location.sample_invalid = curr_location.sample_invalid || sample_invalid - curr_location.metadata_invalid = curr_location.metadata_invalid || metadata_invalid - curr_sample_invalid = curr_location.sample_invalid - curr_metadata_invalid = curr_location.metadata_invalid - curr_location, curr_sample_invalid, curr_metadata_invalid +function invalidate_location(p; kwargs...) + invalidate_metadata(p; kwargs...) + invalidate_samples(p; kwargs...) end +function invalidate_all_locations(p; kwargs...) 
+ for subdir in ["samples", "metadata"] + local_dir = joinpath(homedir(), ".banyan", subdir) + if isdir(local_dir) + rm(local_dir; force=true, recursive=true) + end + end -get_cached_location(remotepath, metadata_invalid, sample_invalid) = - get_cached_location(remotepath, get_remotepath_id(remotepath), metadata_invalid, sample_invalid) + # Delete from S3 + for bucket_name in [banyan_samples_bucket_name(), banyan_metadata_bucket_name()] + banyan_samples_objects = try + res = S3.list_objects_v2(bucket_name)["Contents"] + res isa Base.Vector ? res : [res] + catch e + if is_debug_on() + show(e) + end + [] + end + if !isempty(banyan_samples_objects) + objects_to_delete = [] + for d in banyan_samples_objects + push!(objects_to_delete, Dict("Key" => d["Key"])) + end + try + S3.delete_objects( + bucket_name, + Dict("Objects" => objects_to_delete) + ) + catch e + if is_debug_on() + show(e) + end + end + end + end +end -function cache_location(remotepath, remotepath_id, location_res::Location, invalidate_sample, invalidate_metadata) - location_path = get_location_path(remotepath, remotepath_id) - location_to_write = deepcopy(location_res) - location_to_write.sample_invalid = location_to_write.sample_invalid || invalidate_sample - location_to_write.metadata_invalid = location_to_write.metadata_invalid || invalidate_metadata - serialize(location_path, location_to_write) +function invalidate(p; after=false, kwargs...) + if get(kwargs, after ? :invalidate_all_locations : :all_locations_invalid, false) + invalidate_all_locations(p; kwargs...) + elseif get(kwargs, after ? :invalidate_location : :location_invalid, false) + invalidate_location(p; kwargs...) + elseif get(kwargs, after ? :invalidate_metadata : :metadata_invalid, false) + invalidate_metadata(p; kwargs...) + elseif get(kwargs, after ? :invalidate_samples : :samples_invalid, false) + invalidate_samples(p; kwargs...) + end end -cache_location(remotepath, location_res::Location, invalidate_sample, invalidate_metadata) = - cache_location(remotepath, get_remotepath_id(remotepath), location_res, invalidate_sample, invalidate_metadata) + +@specialize # Functions to be extended for different data formats diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index 1837e244..da4f70f8 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -31,5 +31,5 @@ struct SamplingConfig assume_shuffled::Bool end -const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("256 MB"), false, true) +const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("32 MB"), false, true) session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 7d93572d..667f5fbd 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -6,17 +6,18 @@ function configure_sampling( force_new_sample_rate=nothing, assume_shuffled=nothing, for_all_locations=false, + default=false, kwargs... ) global session_sampling_configs sc = get_sampling_config(path; kwargs...) nsc = SamplingConfig( - !isnothing(sample_rate) ? rate : sc.rate, - !isnothing(always_exact) ? always_exact : sc.always_exact, - !isnothing(max_num_bytes_exact) ? max_num_bytes_exact : sc.max_num_bytes_exact, - !isnothing(force_new_sample_rate) ? force_new_sample_rate : sc.force_new_sample_rate, - !isnothing(assume_shuffled) ? assume_shuffled : sc.assume_shuffled, + (!isnothing(sample_rate) && !default) ? 
sample_rate : sc.rate, + (!isnothing(always_exact) && !default) ? always_exact : sc.always_exact, + (!isnothing(max_num_bytes_exact) && !default) ? max_num_bytes_exact : sc.max_num_bytes_exact, + (!isnothing(force_new_sample_rate) && !default) ? force_new_sample_rate : sc.force_new_sample_rate, + (!isnothing(assume_shuffled) && !default) ? assume_shuffled : sc.assume_shuffled, ) session_id = _get_session_id_no_error() diff --git a/Banyan/test/runtests.jl b/Banyan/test/runtests.jl index 911c5324..36ea9c3b 100644 --- a/Banyan/test/runtests.jl +++ b/Banyan/test/runtests.jl @@ -18,7 +18,6 @@ function use_session_for_testing( nworkers = parse(Int64, get(ENV, "BANYAN_NWORKERS", "2")), sample_rate = 2, nworkers = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -67,10 +66,7 @@ function use_session_for_testing( end ) # If selected session has already failed, this will throw an error. - sessions_for_testing[session_config_hash] = get_session_id() - - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) + sessions_for_testing[session_config_hash] = get_session_id() configure_scheduling(name = scheduling_config_name) diff --git a/BanyanArrays/test/runtests.jl b/BanyanArrays/test/runtests.jl index f0430d4d..a7b9bbc7 100644 --- a/BanyanArrays/test/runtests.jl +++ b/BanyanArrays/test/runtests.jl @@ -15,7 +15,6 @@ function use_session_for_testing( f::Function; sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -69,9 +68,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try diff --git a/BanyanDataFrames/src/df.jl b/BanyanDataFrames/src/df.jl index 485a0d12..b51eb4f0 100644 --- a/BanyanDataFrames/src/df.jl +++ b/BanyanDataFrames/src/df.jl @@ -49,8 +49,10 @@ Base.propertynames(df::DataFrame) = propertynames(sample(df)::DataFrames.DataFra function read_table(path::String; kwargs...) @nospecialize + invalidate(path; kwargs...) df_loc = RemoteTableSource(path; kwargs...) df_loc.src_name == "Remote" || error("$path does not exist") + invalidate(path; after=true, kwargs...) 
df_loc_nrows::Int64 = parse(Int64, df_loc.src_parameters["nrows"]) df_nrows = Future(df_loc_nrows) DataFrame(Future(datatype="DataFrame", source=df_loc), df_nrows) diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index ab369514..17bca8f0 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -206,7 +206,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: total_nbytes_res = reduce_and_sync_across(+, local_nbytes) # If the sample is too small, redo it, getting an exact sample - if !exact_sample_needed_res && total_nbytes_res < max_exact_sample_length + if !exact_sample_needed_res && total_nbytes_res <= sampling_config.max_num_bytes_exact exact_sample_needed = true exact_sample_needed_res = true else diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 1634501e..88a85235 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -26,7 +26,6 @@ reusing in ["nothing", "sample", "location", "sample and location"] # Use session with appropriate sample collection configuration - max_exact_sample_length = exact_or_inexact == "Exact" ? 1_024_000 : 0 use_session_for_testing(sample_rate = 2) do # Use data to collect a sample from @@ -44,11 +43,11 @@ if (reusing == "nothing" || reusing == "location") invalidate_sample(src_name) end - remote_source = RemoteTableSource( - src_name, - shuffled = with_or_without_shuffled == "with", - max_exact_sample_length = max_exact_sample_length + configure_sampling( + always_exact = exact_or_inexact, + assume_shuffled = with_or_without_shuffled == "with", ) + remote_source = RemoteTableSource(src_name) # Verify the location diff --git a/BanyanDataFrames/test/utils_sessions.jl b/BanyanDataFrames/test/utils_sessions.jl index 0abe03db..d569af08 100644 --- a/BanyanDataFrames/test/utils_sessions.jl +++ b/BanyanDataFrames/test/utils_sessions.jl @@ -11,7 +11,6 @@ end function use_session_for_testing( f::Function; sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -67,9 +66,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index d7f8959a..3bef15de 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -1,6 +1,8 @@ function read_hdf5(path; kwargs...) + invalidate(path; kwargs...) A_loc = RemoteHDF5Source(path; kwargs...) A_loc.src_name == "Remote" || error("$path does not exist") + invalidate(path; after=true, kwargs...) 
A = Future(datatype="Array", source=A_loc) A_loc_eltype, A_loc_size = Banyan.from_jl_string(A_loc.src_parameters["eltype_and_size"]) A_loc_eltype = Banyan.type_from_str(A_loc.src_parameters["eltype"]) diff --git a/BanyanHDF5/src/pfs.jl b/BanyanHDF5/src/pfs.jl index b6d551a0..73456013 100644 --- a/BanyanHDF5/src/pfs.jl +++ b/BanyanHDF5/src/pfs.jl @@ -154,10 +154,11 @@ function WriteHelperHDF5( is_main = worker_idx == 1 if is_main # We invalidate both the location and the metadata in this case - serialize( - Banyan.get_location_path(path_and_subpath), - INVALID_LOCATION - ) + invalidate_location(loc_params_path) + # serialize( + # Banyan.get_location_path(path_and_subpath), + # INVALID_LOCATION + # ) end # Invalidate location if diff --git a/BanyanHDF5/test/runtests.jl b/BanyanHDF5/test/runtests.jl index 739ed349..8b155e77 100644 --- a/BanyanHDF5/test/runtests.jl +++ b/BanyanHDF5/test/runtests.jl @@ -16,7 +16,6 @@ function use_session_for_testing( f::Function; sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -71,9 +70,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try diff --git a/BanyanHDF5/test/sample_collection.jl b/BanyanHDF5/test/sample_collection.jl index 82e849bb..d6544edc 100644 --- a/BanyanHDF5/test/sample_collection.jl +++ b/BanyanHDF5/test/sample_collection.jl @@ -19,7 +19,6 @@ reusing in ["nothing", "sample", "location", "sample and location"] # Use session with appropriate sample collection configuration - max_exact_sample_length = exact_or_inexact == "Exact" ? 1_024_000 : 0 use_session_for_testing(sample_rate = 2) do # Use data to collect a sample from @@ -30,13 +29,17 @@ RemoteHDF5Source(src_name, invalidate_metadata = true, invalidate_sample = true) RemoteHDF5Source(src_name, metadata_invalid = true, sample_invalid = true) end - remote_source = RemoteHDF5Source( - src_name, - metadata_invalid = (reusing == "nothing" || reusing == "sample"), - sample_invalid = (reusing == "nothing" || reusing == "location"), - shuffled = with_or_without_shuffled == "with", - max_exact_sample_length = max_exact_sample_length + configure_sampling( + always_exact = exact_or_inexact == "Exact", + assume_shuffled = with_or_without_shuffled == "with" ) + if (reusing == "nothing" || reusing == "sample") + invalidate_metadata(src_name) + end + if (reusing == "nothing" || reusing == "location") + invalidate_samples(src_name) + end + remote_source = RemoteHDF5Source(src_name) # Verify the location if contains(src_name, "h5") diff --git a/BanyanImages/src/image.jl b/BanyanImages/src/image.jl index 77e4cd1a..b413a9ce 100644 --- a/BanyanImages/src/image.jl +++ b/BanyanImages/src/image.jl @@ -1,6 +1,8 @@ -function read_png(path; add_channelview=false) +function read_png(path; add_channelview=false, kwargs...) + invalidate(path; kwargs...) image_loc = RemoteImageSource(path, add_channelview) image_loc.src_name == "Remote" || error("$path does not exist") + invalidate(path; after=true, kwargs...) 
image = Future(;source=image_loc, datatype="Array") image_loc_eltype = type_from_str(image_loc.src_parameters["eltype"]) image_loc_size = size_from_str(image_loc.src_parameters["size"]) diff --git a/BanyanImages/test/runtests.jl b/BanyanImages/test/runtests.jl index e8e485e3..695a4f22 100644 --- a/BanyanImages/test/runtests.jl +++ b/BanyanImages/test/runtests.jl @@ -24,7 +24,6 @@ end function use_session_for_testing( f::Function; sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -79,9 +78,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try diff --git a/BanyanONNXRunTime/test/runtests.jl b/BanyanONNXRunTime/test/runtests.jl index de03c809..cbe80928 100644 --- a/BanyanONNXRunTime/test/runtests.jl +++ b/BanyanONNXRunTime/test/runtests.jl @@ -23,7 +23,6 @@ function use_session_for_testing( f::Function; nworkers = 2, sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -78,9 +77,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try From ee901a80f2c63e917cd18167c1d92aaa7de36d75 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Mon, 8 Aug 2022 12:52:50 -0700 Subject: [PATCH 09/25] Implement parallel SQS data transfer and eliminate AWSSQS.jl dependency --- Banyan/src/Banyan.jl | 9 +-- Banyan/src/queues.jl | 151 +++++++++++++++++++------------------ Banyan/src/requests.jl | 91 ++++++++++++---------- Banyan/src/utils_queues.jl | 3 +- Banyan/test/Project.toml | 1 - Project.toml | 2 + 6 files changed, 137 insertions(+), 120 deletions(-) create mode 100644 Project.toml diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index ce14b371..9dc9f186 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -21,10 +21,7 @@ global NOT_USING_MODULES = String["ProfileView", "SnoopCompileCore"] using FilePathsBase: joinpath, isempty using Base: notnothing, env_project_file -using AWSCore, - AWSS3, - AWSSQS, - Base64, +using Base64, DataStructures, Dates, Downloads, @@ -41,8 +38,10 @@ using AWSCore, TOML using AWS.AWSServices: s3 -using S3: @service +using AWS: @service @service S3 +@service SQS +using AWSS3 global BANYAN_API_ENDPOINT diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index 2cc6c23c..412aef57 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -2,59 +2,28 @@ # GET QUEUE URL # ################# -get_sqs_dict_from_url(url::String)::Dict{Symbol,Any} = - merge( - get_aws_config(), - Dict(:resource => "/" * replace(joinpath(splitpath(url)[end-1:end]), "\\"=>"/")) - ) - -get_scatter_queue()::Dict{Symbol,Any} = - get_sqs_dict_from_url(get_session().scatter_queue_url) - -get_gather_queue()::Dict{Symbol,Any} = - get_sqs_dict_from_url(get_session().gather_queue_url) - -get_execution_queue()::Dict{Symbol,Any} = - get_sqs_dict_from_url(get_session().execution_queue_url) +scatter_queue_url()::Dict{Symbol,Any} = get_session().scatter_queue_url 
+gather_queue_url()::Dict{Symbol,Any} = get_session().gather_queue_url +execution_queue_url()::Dict{Symbol,Any} = get_session().execution_queue_url ################### # RECEIVE MESSAGE # ################### -function sqs_receive_message_with_long_polling(queue) - r = AWSSQS.sqs(queue, "ReceiveMessage", MaxNumberOfMessages = "1") - r = r["messages"] - - if isnothing(r) - return nothing - end - - handle = r[1]["ReceiptHandle"] - id = r[1]["MessageId"] - message = r[1]["Body"] - md5 = r[1]["MD5OfBody"] - - Dict{Symbol,Any}( - :message => message, - :id => id, - :handle => handle - ) -end - function get_next_message( - queue, + queue_url, p::Union{Nothing,ProgressMeter.ProgressUnknown} = nothing; delete::Bool = true, error_for_main_stuck::Union{Nothing,String} = nothing, error_for_main_stuck_time::Union{Nothing,DateTime} = nothing )::Tuple{String,Union{Nothing,String}} -error_for_main_stuck = check_worker_stuck(error_for_main_stuck, error_for_main_stuck_time) - m = sqs_receive_message_with_long_polling(queue) + error_for_main_stuck = check_worker_stuck(error_for_main_stuck, error_for_main_stuck_time) + m = SQS.receive_message(queue_url, Dict("MaxNumberOfMessages" => "1")) i = 1 j = 1 - while (isnothing(m)) + while (!haskey(m, "ReceiveMessageResult") || !haskey(m["ReceiveMessageResult"], "Message")) error_for_main_stuck = check_worker_stuck(error_for_main_stuck, error_for_main_stuck_time) - m = sqs_receive_message_with_long_polling(queue) + m = SQS.receive_message(queue_url, Dict("MaxNumberOfMessages" => "1")) i += 1 if !isnothing(p) p::ProgressMeter.ProgressUnknown @@ -62,19 +31,21 @@ error_for_main_stuck = check_worker_stuck(error_for_main_stuck, error_for_main_s j += 1 end end + m_dict = m["ReceiveMessageResult"]["Message"] if delete - sqs_delete_message(queue, m) + SQS.delete_message(queue_url, m_dict["ReceiptHandle"]::String) end - return m[:message]::String, error_for_main_stuck + return m_dict["Body"]::String, error_for_main_stuck end -function receive_next_message( +function sqs_receive_next_message( queue_name, p=nothing, error_for_main_stuck=nothing, error_for_main_stuck_time=nothing )::Tuple{Dict{String,Any},Union{Nothing,String}} - content::String, error_for_main_stuck::Union{Nothing,String} = get_next_message(queue_name, p; error_for_main_stuck=error_for_main_stuck, error_for_main_stuck_time=error_for_main_stuck_time) + content::String, error_for_main_stuck::Union{Nothing,String} = + get_next_message(queue_name, p; error_for_main_stuck=error_for_main_stuck, error_for_main_stuck_time=error_for_main_stuck_time) res::Dict{String,Any} = if startswith(content, "JOB_READY") || startswith(content, "SESSION_READY") Dict{String,Any}( "kind" => "SESSION_READY" @@ -126,12 +97,9 @@ end function receive_from_client(value_id::ValueId) # Send scatter message to client message = Dict{String,String}("kind" => "SCATTER_REQUEST", "value_id" => value_id) - send_message( - get_gather_queue(), - JSON.json(message) - ) + sqs_send_message(gather_queue_url(), JSON.json(message)) # Receive response from client - m = JSON.parse(get_next_message(get_scatter_queue())[1]) + m = JSON.parse(get_next_message(scatter_queue_url())[1]) v = from_jl_string(m["contents"]::String) v end @@ -141,44 +109,81 @@ end # SEND MESSAGE # ################ -function send_message(queue_name, message) +function sqs_send_message(queue_url, message) generated_message_id = generate_message_id() - sqs_send_message( - queue_name, + SQS.send_message( + queue_url, message, - (:MessageGroupId, "1"), - (:MessageDeduplicationId, 
generated_message_id), + Dict( + "MessageGroupId" => "1", + "MessageDeduplicationId" => generated_message_id + ) ) end function send_to_client(value_id::ValueId, value, worker_memory_used = 0) MAX_MESSAGE_LENGTH = 220_000 message = to_jl_string(value)::String - i = 1 + + # Break the message down into chunk ranges + nmessages = 0 + message_length = length(message) + message_ranges = [] + message_i = 1 while true - is_last_message = length(message) <= MAX_MESSAGE_LENGTH + is_last_message = message_length <= MAX_MESSAGE_LENGTH + starti = message_i + if is_last_message + message_i += message_length + message_length = 0 + else + message_i += MAX_MESSAGE_LENGTH + message_length -= MAX_MESSAGE_LENGTH + end + push!(message_ranges, starti:message_i) + nmessages += 1 + if is_last_message + break + end + end + + # Launch asynchronous threads to send SQS messages + gather_q_url = gather_queue_url() + num_chunks = length(message_ranges) + if num_chunks > 1 + @sync for i = 1:message_ranges + @async begin + msg = Dict{String,Any}( + "kind" => "GATHER", + "value_id" => value_id, + "contents" => message[message_ranges[i]], + "worker_memory_used" => worker_memory_used, + "chunk_idx" => i, + "num_chunks" => num_chunks + ) + msg_json = JSON.json(msg) + SQS.send_message( + msg_json, + gather_q_url, + Dict("MessageGroupId" => string(i)) + ) + end + end + else + i = 1 msg = Dict{String,Any}( - "kind" => (is_last_message ? "GATHER_END" : "GATHER"), + "kind" => "GATHER", "value_id" => value_id, - "contents" => if is_last_message - message - else - msg = message[1:MAX_MESSAGE_LENGTH] - message = message[MAX_MESSAGE_LENGTH+1:end] - msg - end, + "contents" => message[message_ranges[i]], "worker_memory_used" => worker_memory_used, - "gather_page_idx" => i + "chunk_idx" => i, + "num_chunks" => num_chunks ) - send_message( - get_gather_queue(), - JSON.json( - msg - ) + msg_json = JSON.json(msg) + SQS.send_message( + msg_json, + gather_q_url, + Dict("MessageGroupId" => string(i)) ) - i += 1 - if is_last_message - break - end end end diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 6be89660..64a44981 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -18,13 +18,13 @@ ############################# function check_worker_stuck_error( - message::Dict{String,Any}, + value_id::ValueId, + contents::String, error_for_main_stuck::Union{Nothing,String}, error_for_main_stuck_time::Union{Nothing,DateTime} )::Tuple{Union{Nothing,String},Union{Nothing,DateTime}} - value_id = message["value_id"]::ValueId if value_id == "-2" && isnothing(error_for_main_stuck_time) - error_for_main_stuck_msg::String = from_jl_string(message["contents"]::String) + error_for_main_stuck_msg::String = from_jl_string(contents) if contains(error_for_main_stuck_msg, "session $(get_session_id())") error_for_main_stuck = error_for_main_stuck_msg error_for_main_stuck_time = Dates.now() @@ -230,8 +230,8 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n end # Get queues for moving data between client and cluster - scatter_queue = get_scatter_queue() - gather_queue = get_gather_queue() + scatter_queue = scatter_queue_url() + gather_queue = gather_queue_url() # There are two cases: either we # TODO: Maybe we don't need to wait_For_session @@ -261,10 +261,9 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n p = ProgressUnknown("Computing value with ID $(fut.value_id)", spinner=true) error_for_main_stuck::Union{Nothing,String} = nothing 
error_for_main_stuck_time::Union{Nothing,DateTime} = nothing - partial_gathers = Dict{ValueId,String}() while true # TODO: Use to_jl_value and from_jl_value to support Client - message, error_for_main_stuck = receive_next_message(gather_queue, p, error_for_main_stuck, error_for_main_stuck_time) + message, error_for_main_stuck = sqs_receive_next_message(gather_queue, p, error_for_main_stuck, error_for_main_stuck_time) message_type::String = message["kind"] if message_type == "SCATTER_REQUEST" # Send scatter @@ -272,7 +271,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n haskey(session.futures_on_client, value_id) || error("Expected future to be stored on client side") f = session.futures_on_client[value_id]::Future # @debug "Received scatter request for value with ID $value_id and value $(f.value) with location $(get_location(f))" - send_message( + sqs_send_message( scatter_queue, JSON.json( Dict{String,Any}( @@ -286,23 +285,33 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n elseif message_type == "GATHER" # Receive gather value_id = message["value_id"]::ValueId - if !haskey(partial_gathers, value_id) - partial_gathers[value_id] = message["contents"]::String + num_chunks = message["num_chunks"]::Int64 + num_remaining_chunks = num_chunks - 1 + + whole_message_contents = if num_chunks > 1 + partial_messages = Vector{String}(undef, num_chunks) + partial_messages[message["chunk_idx"]] = message["contents"] + @sync for i = 1:num_remaining_chunks + @async begin + partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) + chunk_idx = partial_message["chunk_idx"] + partial_messages[chunk_idx] = message["contents"] + end + end + join(partial_messages) else - partial_gathers[value_id] *= message["contents"]::String + message["contents"] end - elseif message_type == "GATHER_END" - value_id = message["value_id"]::ValueId - contents = get(partial_gathers, value_id, "") * message["contents"]::String - # @debug "Received gather request for $value_id" + if haskey(session.futures_on_client, value_id) - value = from_jl_string(contents) + value = from_jl_string(whole_message_contents) f = session.futures_on_client[value_id]::Future f.value = value # TODO: Update stale/mutated here to avoid costly # call to `send_evaluation` end - error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(message, error_for_main_stuck, error_for_main_stuck_time) + + error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, contents, error_for_main_stuck, error_for_main_stuck_time) elseif message_type == "EVALUATION_END" if message["end"]::Bool == true break @@ -683,39 +692,43 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) p = ProgressUnknown("Running offloaded code", spinner=true) session = get_session() - gather_queue = get_gather_queue() + gather_queue = gather_queue_url() stored_message = nothing error_for_main_stuck, error_for_main_stuck_time = nothing, nothing partial_gathers = Dict{ValueId,String}() while true - message, error_for_main_stuck = receive_next_message(gather_queue, p, error_for_main_stuck, error_for_main_stuck_time) + message, error_for_main_stuck = sqs_receive_next_message(gather_queue, p, error_for_main_stuck, error_for_main_stuck_time) message_type = message["kind"]::String if message_type == "GATHER" # Receive gather value_id = message["value_id"]::ValueId - contents = message["contents"]::String - if !haskey(partial_gathers, 
value_id) - partial_gathers[value_id] = contents + num_chunks = message["num_chunks"]::Int64 + num_remaining_chunks = num_chunks - 1 + + whole_message_contents = if num_chunks > 1 + partial_messages = Vector{String}(undef, num_chunks) + partial_messages[message["chunk_idx"]] = message["contents"] + @sync for i = 1:num_remaining_chunks + @async begin + partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) + chunk_idx = partial_message["chunk_idx"] + partial_messages[chunk_idx] = message["contents"] + end + end + join(partial_messages) else - partial_gathers[value_id] *= contents + message["contents"] end - elseif message_type == "GATHER_END" - value_id = message["value_id"]::ValueId - contents = get(partial_gathers, value_id, "") * message["contents"]::String - if (value_id == "-1") - memory_used = message["worker_memory_used"]::Int64 - if Banyan.INVESTIGATING_MEMORY_USAGE - @show get_session().worker_memory_used - @show memory_used - end - # Note that while the memory usage from offloaded computation does get - # reset with each session even if it reuses the same job, we do - # recompute the initial available memory every time we start a session - # and this should presumably include the offloaded memory usage. - get_session().worker_memory_used = get_session().worker_memory_used + memory_used - stored_message = from_jl_string(contents) + + if haskey(session.futures_on_client, value_id) + value = from_jl_string(whole_message_contents) + f = session.futures_on_client[value_id]::Future + f.value = value + # TODO: Update stale/mutated here to avoid costly + # call to `send_evaluation` end - error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(message, error_for_main_stuck, error_for_main_stuck_time) + + error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, contents, error_for_main_stuck, error_for_main_stuck_time) elseif (message_type == "EVALUATION_END") if message["end"]::Bool == true return stored_message diff --git a/Banyan/src/utils_queues.jl b/Banyan/src/utils_queues.jl index 17755bf6..da73b514 100644 --- a/Banyan/src/utils_queues.jl +++ b/Banyan/src/utils_queues.jl @@ -1,5 +1,4 @@ using Dates -using AWSSQS @nospecialize @@ -33,7 +32,7 @@ function run_with_retries( end sqs_get_queue_with_retries(args...; kwargs...) = run_with_retries( - sqs_get_queue, + SQS.get_queue_url, args...; failure_message = "Queue for communicating results is nonexistent", kwargs... 
diff --git a/Banyan/test/Project.toml b/Banyan/test/Project.toml index 229e8c72..dd440603 100644 --- a/Banyan/test/Project.toml +++ b/Banyan/test/Project.toml @@ -2,7 +2,6 @@ AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" -AWSSQS = "6e80b5ca-5733-51f9-999e-c18680912812" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Banyan = "706d138b-e922-45b9-a636-baf8ae0d5317" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" diff --git a/Project.toml b/Project.toml new file mode 100644 index 00000000..8aa62950 --- /dev/null +++ b/Project.toml @@ -0,0 +1,2 @@ +[deps] +AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" From 8daff0a7bd2d0aa33193c2abc4a5f25907a86994 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 9 Aug 2022 07:25:04 -0700 Subject: [PATCH 10/25] Remove redundant total_memory_usage --- Banyan/src/Banyan.jl | 6 +- Banyan/src/annotation.jl | 16 ++--- Banyan/src/clusters.jl | 6 +- Banyan/src/future.jl | 4 +- Banyan/src/futures.jl | 2 +- Banyan/src/location.jl | 50 ++++++-------- Banyan/src/locations.jl | 22 +++--- Banyan/src/precompile.jl | 5 +- Banyan/src/sample.jl | 15 ++-- Banyan/src/samples.jl | 5 -- Banyan/src/sessions.jl | 18 ++--- Banyan/src/utils.jl | 58 ++-------------- Banyan/src/utils_s3fs.jl | 6 +- Banyan/test/clusters.jl | 16 ++--- Banyan/test/sessions.jl | 2 +- BanyanDataFrames/src/BanyanDataFrames.jl | 2 +- BanyanDataFrames/src/gdf.jl | 2 +- BanyanDataFrames/src/locations.jl | 14 ++-- BanyanDataFrames/src/pfs.jl | 12 ++-- BanyanDataFrames/test/latency.jl | 4 +- BanyanDataFrames/test/runtests.jl | 2 +- .../test/runtests_without_retest.jl | 6 +- BanyanDataFrames/test/sample_collection.jl | 69 +++++++++++++++++-- BanyanDataFrames/test/utils_data.jl | 38 +++++----- BanyanHDF5/src/locations.jl | 2 +- BanyanHDF5/test/runtests.jl | 4 +- BanyanImages/src/locations.jl | 4 +- BanyanImages/test/locations.jl | 2 +- BanyanImages/test/pfs.jl | 2 +- BanyanImages/test/utils_data.jl | 10 +-- BanyanONNXRunTime/src/locations.jl | 2 +- 31 files changed, 203 insertions(+), 203 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 9dc9f186..640175cf 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -29,7 +29,6 @@ using Base64, FilePathsBase, HTTP, JSON, - IniFile, LibGit2, MPI, ProgressMeter, @@ -37,6 +36,7 @@ using Base64, Serialization, TOML +using AWS using AWS.AWSServices: s3 using AWS: @service @service S3 @@ -85,7 +85,7 @@ export AbstractFuture, Future, partitioned_computation, compute_inplace, compute # Samples export Sample, ExactSample, sample, sample_for_grouping, SampleForGrouping, setsample! -export sample_memory_usage, total_memory_usage, sample_axes, sample_keys, sample_by_key +export sample_memory_usage, sample_memory_usage, sample_axes, sample_keys, sample_by_key export NOTHING_SAMPLE export SamplingConfig @@ -189,7 +189,7 @@ export is_debug_on, export Empty, EMPTY, nonemptytype, disallowempty, empty_handler # Utilities for location constructors -export get_cached_location, cache_location, get_sample_from_data, sample_from_range +export get_sample_from_data, sample_from_range # Partitioning functions for usage in sessions that run on the cluster; dispatched # based on `res/pf_dispatch_table.json`. 
diff --git a/Banyan/src/annotation.jl b/Banyan/src/annotation.jl index 7a8a0891..9c5d3243 100644 --- a/Banyan/src/annotation.jl +++ b/Banyan/src/annotation.jl @@ -611,8 +611,8 @@ function apply_mutation(old::Future, new::Future) new.mutated, old.stale, new.stale, - old.total_memory_usage, - new.total_memory_usage, + old.sample_memory_usage, + new.sample_memory_usage, session_locations[old.value_id], session_locations[new.value_id] = new.value, @@ -623,8 +623,8 @@ function apply_mutation(old::Future, new::Future) old.mutated, new.stale, old.stale, - new.total_memory_usage, - old.total_memory_usage, + new.sample_memory_usage, + old.sample_memory_usage, session_locations[new.value_id], session_locations[old.value_id] end @@ -675,11 +675,11 @@ function finish_partitioned_code_region(splatted_futures::Vector{Future}) # Get the initial memory usage for fut in splatted_futures - fut_initial_memory_usage::Int64 = if is_total_memory_usage_known(fut) - fut.total_memory_usage + fut_initial_memory_usage::Int64 = if is_sample_memory_usage_known(fut) + fut.sample_memory_usage else tmu::Int64 = try - get_location(fut).total_memory_usage + get_location(fut).sample_memory_usage catch e if e isa MethodError error("Future with value ID $(fut.value_id) has no initial memory usage even in location with source name $(get_location(fut).src_name)") @@ -877,7 +877,7 @@ function finish_partitioned_code_region(splatted_futures::Vector{Future}) # Destroy value IDs that are no longer needed because of mutation for fut in splatted_futures - fut.total_memory_usage = task.memory_usage[fut.value_id]["final"] + fut.sample_memory_usage = task.memory_usage[fut.value_id]["final"] # Issue destroy request for mutated futures that are no longer # going to be used diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 91c63217..c96efa47 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -75,7 +75,7 @@ function create_cluster(; end if isnothing(s3_bucket_arn) s3_bucket_arn = "" - elseif !(s3_bucket_name in s3_list_buckets(get_aws_config())) + elseif !(s3_bucket_name in s3_list_buckets(global_aws_config())) error("Bucket $s3_bucket_name does not exist in the connected AWS account") end @@ -294,7 +294,7 @@ end function upload_to_s3(src_path; dst_name=basename(src_path), cluster_name=get_cluster_name(), kwargs...) configure(; kwargs...) 
bucket_name = get_cluster_s3_bucket_name(cluster_name) - s3_dst_path = S3Path("s3://$bucket_name/$dst_name", config=get_aws_config()) + s3_dst_path = S3Path("s3://$bucket_name/$dst_name", config=global_aws_config()) if startswith(src_path, "http://") || startswith(src_path, "https://") Downloads.download( src_path, @@ -320,7 +320,7 @@ function upload_to_s3(src_path; dst_name=basename(src_path), cluster_name=get_cl Path("$src_path/$f_name"), S3Path( "s3://$bucket_name/$(basename(src_path))/$(f_name)", - config=get_aws_config() + config=global_aws_config() ) ) end diff --git a/Banyan/src/future.jl b/Banyan/src/future.jl index 74318306..7fc1e3f0 100644 --- a/Banyan/src/future.jl +++ b/Banyan/src/future.jl @@ -4,7 +4,7 @@ mutable struct Future <: AbstractFuture value_id::ValueId mutated::Bool stale::Bool - total_memory_usage::Int64 + sample_memory_usage::Int64 end const NOTHING_FUTURE = Future("", nothing, "", false, false, -1) @@ -12,7 +12,7 @@ Base.isnothing(f::Future) = isempty(f.value_id) Base.hash(f::Future) = hash(f.value_id) -is_total_memory_usage_known(f::Future) = f.total_memory_usage != -1 +is_sample_memory_usage_known(f::Future) = f.sample_memory_usage != -1 isview(f::AbstractFuture) = false diff --git a/Banyan/src/futures.jl b/Banyan/src/futures.jl index 01cbde61..c1e769a1 100644 --- a/Banyan/src/futures.jl +++ b/Banyan/src/futures.jl @@ -46,7 +46,7 @@ function create_new_future(source::Location, mutate_from::Future, datatype::Stri end function create_future_from_sample(value::T, datatype::String)::Future where T - location::Location = if total_memory_usage(value) ≤ 4 * 1024 + location::Location = if sample_memory_usage(value) ≤ 4 * 1024 Value(value) else # TODO: Store values in S3 instead so that we can read from there diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 8f52660e..85a79dc7 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -8,36 +8,10 @@ mutable struct Location dst_name::String src_parameters::LocationParameters dst_parameters::LocationParameters - total_memory_usage::Int64 + sample_memory_usage::Int64 sample::Sample metadata_invalid::Bool sample_invalid::Bool - - # function Location( - # src_name::String, - # dst_name::String, - # src_parameters::Dict{String,<:Any}, - # dst_parameters::Dict{String,<:Any}, - # total_memory_usage::Union{Int64,Nothing} = nothing, - # sample::Sample = Sample(), - # ) - # # NOTE: A file might be None and None if it is simply to be cached on - # # disk and then read from - # # if src_name == "None" && dst_name == "None" - # # error( - # # "Location must either be usable as a source or as a destination for data", - # # ) - # # end - - # new( - # src_name, - # dst_name, - # src_parameters, - # dst_parameters, - # total_memory_usage, - # sample - # ) - # end end struct LocationPath @@ -66,8 +40,10 @@ struct LocationPath LocationPath(path) = LocationPath(path, "jl", get_julia_version())`` end +# Functions with `LocationPath`s` + global TABLE_FORMATS = ["csv", "parquet", "arrow"] -z + function get_location_path_with_format(p::String, kwargs...)::LocationPath if isempty(p) return NO_LOCATION_PATH @@ -102,6 +78,16 @@ Base.hash(lp::LocationPath) = lp.path_hash_uint const NO_LOCATION_PATH = LocationPath("", "", "") +# Sample config management + +const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("32 MB"), false, true) +session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) + +function 
set_session_sampling_configs(d::Dict{SessionId,Dict{LocationPath,SamplingConfig}}) + global session_sampling_configs + session_sampling_configs = d +end + get_sampling_config(path="", kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) function get_sampling_configs() global session_sampling_configs @@ -112,6 +98,8 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = get(scs, l_path, scs[NO_LOCATION_PATH]) end +# Getting sample rate + get_sample_rate(p::String=""; kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) parse_sample_rate(object_key) = @@ -153,6 +141,8 @@ function get_sample_rate(l_path::LocationPath) sample_rate != -1 ? sample_rate : desired_sample_rate end +# Checking for having metadata, samples + function has_metadata(l_path:: LocationPath)::Bool try !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) @@ -171,6 +161,8 @@ function has_sample(l_path:: LocationPath)::Bool end end +# Helper function for getting `Location` for location constructors + twodigit(i::Int64) = i < 10 ? ("0" * string(i)) : string(i) get_src_params_dict(d::Union{Nothing,Base.ImmutableDict{String, String}}) = @@ -331,7 +323,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} res_location = LocationSource( get(src_params, "name", "Remote"), src_params, - parse(Int64, get(src_params, "total_memory_usage", "0")), + parse(Int64, get(src_params, "sample_memory_usage", "0")), NOTHING_SAMPLE ) res_location.metadata_invalid = isempty(src_params) diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 7c0a37a2..15a7e442 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -6,13 +6,13 @@ const NOTHING_LOCATION = Location("None", "None", LocationParameters(), Location const INVALID_LOCATION = Location("None", "None", LocationParameters(), LocationParameters(), Int64(-1), NOTHING_SAMPLE, true, true) -Location(name::String, parameters::LocationParameters, total_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = - Location(name, name, parameters, parameters, total_memory_usage, sample, false, false) +Location(name::String, parameters::LocationParameters, sample_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = + Location(name, name, parameters, parameters, sample_memory_usage, sample, false, false) Base.isnothing(l::Location) = isnothing(l.sample) -LocationSource(name::String, parameters::LocationParameters, total_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = - Location(name, "None", parameters, LocationParameters(), total_memory_usage, sample, false, false) +LocationSource(name::String, parameters::LocationParameters, sample_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = + Location(name, "None", parameters, LocationParameters(), sample_memory_usage, sample, false, false) LocationDestination( name::String, @@ -31,7 +31,7 @@ function to_jl(lt::Location) # TODO: Instead of computing the total memory usage here, compute it # at the end of each `@partitioned`. That way we will count twice for # mutation - "total_memory_usage" => lt.total_memory_usage == -1 ? nothing : lt.total_memory_usage, + "sample_memory_usage" => lt.sample_memory_usage == -1 ? 
nothing : lt.sample_memory_usage, ) end @@ -59,7 +59,7 @@ function sourced(fut::Future, loc::Location) "None", loc.src_parameters, Dict{String,Any}(), - loc.total_memory_usage, + loc.sample_memory_usage, if !isnothing(loc.sample.value) # If this location is like some remote location, then we need # a sample from it. @@ -81,7 +81,7 @@ function sourced(fut::Future, loc::Location) fut_location.dst_name, loc.src_parameters, fut_location.dst_parameters, - loc.total_memory_usage, + loc.sample_memory_usage, if !isnothing(loc.sample.value) # If this location is like some remote location, then we need # a sample from it. @@ -114,7 +114,7 @@ function destined(fut::Future, loc::Location) loc.dst_name, EMPTY_DICT, loc.dst_parameters, - fut_location.total_memory_usage, + fut_location.sample_memory_usage, Sample(), loc.metadata_invalid, loc.sample_invalid @@ -129,7 +129,7 @@ function destined(fut::Future, loc::Location) loc.dst_name, fut_location.src_parameters, loc.dst_parameters, - fut_location.total_memory_usage, + fut_location.sample_memory_usage, fut_location.sample, fut_location.metadata_invalid, fut_location.sample_invalid @@ -211,7 +211,7 @@ get_dst_parameters(fut)::LocationParameters = get_location(fut).dst_parameters #################### function Value(val::T)::Location where {T} - LocationSource("Value", Dict{String,Any}("value" => to_jl_value(val)), total_memory_usage(val), ExactSample(val)) + LocationSource("Value", Dict{String,Any}("value" => to_jl_value(val)), sample_memory_usage(val), ExactSample(val)) end # TODO: Implement Size @@ -223,7 +223,7 @@ Size(val)::Location = LocationSource( ) function Client(val::T)::Location where {T} - LocationSource("Client", Dict{String,Any}(), total_memory_usage(val), ExactSample(val)) + LocationSource("Client", Dict{String,Any}(), sample_memory_usage(val), ExactSample(val)) end const CLIENT = Location("None", "Client", LocationParameters(), LocationParameters(), Int64(0), Sample(nothing, Int64(0), Int64(1)), false, false) Client()::Location = deepcopy(CLIENT) diff --git a/Banyan/src/precompile.jl b/Banyan/src/precompile.jl index e3e89a2f..c87e406e 100644 --- a/Banyan/src/precompile.jl +++ b/Banyan/src/precompile.jl @@ -285,8 +285,9 @@ function _precompile_() end # locations.jl - precompile(get_cached_location, (String, Bool, Bool)) - precompile(cache_location, (String, Location, Bool, Bool)) + for lp_func in [get_sample_rate, get_location_source, has_metadata, has_sample] + precompile(lp_func, (LocationPath,)) + end precompile(sample_from_range, (UnitRange{Int64}, Int64)) # utils.jl, utils_s3fs.jl diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index da4f70f8..7db1bfdc 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -12,24 +12,25 @@ mutable struct Sample new(nothing, objectid(nothing), 0, get_sample_rate(), Any[]) # Sample(value::Any) = # new(value, objectid(value), sample_memory_usage(value), get_sample_rate(), Any[]) - function Sample(value::Any, total_memory_usage::Int64, sample_rate::Int64) + function Sample(value::Any, sample_memory_usage::Int64, sample_rate::Int64) # sample_rate = get_sample_rate() - memory_usage = convert(Int64, round(total_memory_usage / sample_rate))::Int64 + memory_usage = convert(Int64, round(sample_memory_usage / sample_rate))::Int64 new(value, objectid(value), memory_usage, sample_rate, Any[]) end function Sample(value::Any, sample_rate::Int64) # This is only for the NOTHING_SAMPLE and ExactSample new(value, objectid(value), sample_memory_usage(value), sample_rate, Any[]) - end + end end +const 
NOTHING_SAMPLE = Sample(nothing, Int64(-1)) + +Base.isnothing(s::Sample) = s.rate == -1 + struct SamplingConfig rate::Int64 always_exact::Bool max_num_bytes_exact::Int64 force_new_sample_rate::Bool assume_shuffled::Bool -end - -const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("32 MB"), false, true) -session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) \ No newline at end of file +end \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 667f5fbd..2e6e6d39 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -68,7 +68,6 @@ impl_error(fn_name, as) = error("$fn_name not implemented for $(typeof(as))") sample_by_key(as::Any, key::Any) = impl_error("sample_by_key", as) sample_axes(as::Any)::Vector{Int64} = impl_error("sample_axes", as) sample_keys(as::Any) = impl_error("sample_keys", as) -sample_memory_usage(as::Any)::Int64 = total_memory_usage(as) # Sample computation functions @@ -200,10 +199,6 @@ function sample_max(A::T, key::K) where {T,K} isempty(A) ? nothing : _maximum(orderinghashes(A, key)) end -const NOTHING_SAMPLE = Sample(nothing, UInt(0), Int64(-1), Int64(-1), Int64[]) - -Base.isnothing(s::Sample) = s.rate == -1 - # Caching samples with same statistics # A sample with memoized statistics for diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index ea1945e5..d0361657 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -175,15 +175,15 @@ function _start_session( environment_hash = get_hash(project_toml * manifest_toml * version) environment_info["environment_hash"] = environment_hash environment_info["project_toml"] = "$(environment_hash)/Project.toml" - file_already_in_s3 = isfile(S3Path("s3://$(s3_bucket_name)/$(environment_hash)/Project.toml", config=get_aws_config())) + file_already_in_s3 = isfile(S3Path("s3://$(s3_bucket_name)/$(environment_hash)/Project.toml", config=global_aws_config())) if !file_already_in_s3 - s3_put(get_aws_config(), s3_bucket_name, "$(environment_hash)/Project.toml", project_toml) + s3_put(global_aws_config(), s3_bucket_name, "$(environment_hash)/Project.toml", project_toml) end if manifest_toml != "" environment_info["manifest_toml"] = "$(environment_hash)/Manifest.toml" - file_already_in_s3 = isfile(S3Path("s3://$(s3_bucket_name)/$(environment_hash)/Manifest.toml", config=get_aws_config())) + file_already_in_s3 = isfile(S3Path("s3://$(s3_bucket_name)/$(environment_hash)/Manifest.toml", config=global_aws_config())) if !file_already_in_s3 - s3_put(get_aws_config(), s3_bucket_name, "$(environment_hash)/Manifest.toml", manifest_toml) + s3_put(global_aws_config(), s3_bucket_name, "$(environment_hash)/Manifest.toml", manifest_toml) end end else @@ -208,9 +208,9 @@ function _start_session( # Upload files to S3 for f in vcat(files, code_files) - s3_path = S3Path("s3://$(s3_bucket_name)/$(basename(f))", config=get_aws_config()) + s3_path = S3Path("s3://$(s3_bucket_name)/$(basename(f))", config=global_aws_config()) if !isfile(s3_path) || force_update_files - s3_put(get_aws_config(), s3_bucket_name, basename(f), load_file(f)) + s3_put(global_aws_config(), s3_bucket_name, basename(f), load_file(f)) end end # TODO: Optimize so that we only upload (and download onto cluster) the files if the filename doesn't already exist @@ -488,7 +488,7 @@ function download_session_logs(session_id::SessionId, cluster_name::String, file mkdir(joinpath(homedir(), ".banyan", "logs")) end filename = 
!isnothing(filename) ? filename : joinpath(homedir(), ".banyan", "logs", log_file_name) - s3_get_file(get_aws_config(), s3_bucket_name, log_file_name, filename) + s3_get_file(global_aws_config(), s3_bucket_name, log_file_name, filename) @info "Downloaded logs for session with ID $session_id to $filename" return filename end @@ -496,10 +496,10 @@ end function print_session_logs(session_id, cluster_name, delete_file=true) s3_bucket_name = get_cluster_s3_bucket_name(cluster_name) log_file_name = "banyan-log-for-session-$(session_id)" - logs = s3_get(get_aws_config(), s3_bucket_name, log_file_name) + logs = s3_get(global_aws_config(), s3_bucket_name, log_file_name) println(String(logs)) if delete_file - s3_delete(get_aws_config(), s3_bucket_name, log_file_name) + s3_delete(global_aws_config(), s3_bucket_name, log_file_name) end end diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 6a609027..6daafc17 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -21,7 +21,7 @@ json_to_jl(j) = JSON.parse(j) key_to_jl(key) = reinterpret(UInt8, hash(string(key))) |> String axis_to_jl(axis) = reinterpret(UInt8, hash(string(key))) |> String -total_memory_usage(val)::Int64 = +sample_memory_usage(val::Any)::Int64 = begin size = Base.summarysize(val) # TODO: Maybe make this larger @@ -97,7 +97,6 @@ end # Banyan.jl may be being used). However, wrapping this in a mutex to ensure # synchronized mutation in this module would be a good TODO. global banyan_config = nothing -global aws_config_in_usage = nothing @nospecialize @@ -220,56 +219,7 @@ end @specialize -""" -Get the value for `key` in the `ini` file for a given `profile`. -""" -function _get_ini_value( - ini::Inifile, profile::String, key::String; default_value=nothing -) - value = get(ini, "profile $profile", key) - value === :notfound && (value = get(ini, profile, key)) - value === :notfound && (value = default_value) - - return value -end - -function get_aws_config()::Dict{Symbol,Any} - global aws_config_in_usage - - # Get AWS configuration - if isnothing(aws_config_in_usage) - # Get region according to ENV, then credentials, then config files - profile = get(ENV, "AWS_DEFAULT_PROFILE", get(ENV, "AWS_DEFAULT_PROFILE", "default")) - region::String = get(ENV, "AWS_DEFAULT_REGION", "") - if region == "" - try - configfile = read(Inifile(), joinpath(homedir(), ".aws", "config")) - region = convert(String, _get_ini_value(configfile, profile, "region", default_value=""))::String - catch - end - end - if region == "" - try - credentialsfile = read(Inifile(), joinpath(homedir(), ".aws", "credentials")) - region = convert(String, _get_ini_value(credentialsfile, profile, "region", default_value=""))::String - catch - end - end - - if region == "" - throw(ErrorException("Could not discover AWS region to use from looking at AWS_PROFILE, AWS_DEFAULT_PROFILE, AWS_DEFAULT_REGION, HOME/.aws/credentials, and HOME/.aws/config")) - end - - aws_config_in_usage = Dict{Symbol,Any}( - :creds => AWSCredentials(), - :region => region - ) - end - - aws_config_in_usage -end - -get_aws_config_region() = get_aws_config()[:region]::String +get_aws_config_region() = global_aws_config().region ######################### # ENVIRONMENT VARIABLES # @@ -445,7 +395,7 @@ function load_json(path::String) elseif startswith(path, "s3://") error("S3 path not currently supported") # TODO: Maybe support with - # `JSON.parsefile(S3Path(path, config=get_aws_config()))` and also down + # `JSON.parsefile(S3Path(path, config=global_aws_config()))` and also down # in `load_toml` elseif 
startswith(path, "http://") || startswith(path, "https://") JSON.parse(request_body(path)[2]) @@ -462,7 +412,7 @@ function load_toml(path::String) TOML.parsefile(path[8:end]) elseif startswith(path, "s3://") error("S3 path not currently supported") - # JSON.parsefile(S3Path(path, config=get_aws_config())) + # JSON.parsefile(S3Path(path, config=global_aws_config())) elseif startswith(path, "http://") || startswith(path, "https://") TOML.parse(request_body(path)[2]) else diff --git a/Banyan/src/utils_s3fs.jl b/Banyan/src/utils_s3fs.jl index 2878c05c..f3861bc9 100644 --- a/Banyan/src/utils_s3fs.jl +++ b/Banyan/src/utils_s3fs.jl @@ -55,7 +55,7 @@ function download_remote_s3_path(path::String) global failed_to_use_s3fs # Get information about requested object - s3path = S3Path(path, config = get_aws_config()) + s3path = S3Path(path, config = global_aws_config()) bucket = s3path.bucket key = s3path.key # bucket = "banyan-cluster-data-myfirstcluster" @@ -96,8 +96,8 @@ function download_remote_s3_path(path::String) # TODO: Store buckets from different accounts/IAMs/etc. seperately try - ACCESS_KEY_ID = get_aws_config()[:creds].access_key_id - SECRET_ACCESS_KEY = get_aws_config()[:creds].secret_key + ACCESS_KEY_ID = global_aws_config()[:creds].access_key_id + SECRET_ACCESS_KEY = global_aws_config()[:creds].secret_key passwd_s3fs_contents = ACCESS_KEY_ID * ":" * SECRET_ACCESS_KEY HOME = homedir() region = get_aws_config_region() diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index 6531f9b0..61af636e 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -28,7 +28,7 @@ end end function bucket_exists(s3_bucket_name) - ispath(S3Path("s3://$(s3_bucket_name)", config=Banyan.get_aws_config())) + ispath(S3Path("s3://$(s3_bucket_name)", config=Banyan.global_aws_config())) end @testset "Create clusters" begin @@ -55,7 +55,7 @@ end s3_bucket = nothing elseif s3_bucket == "user-provided" s3_bucket = Random.randstring(['a':'z'; '0':'9'], 6) - s3_create_bucket(Banyan.get_aws_config(), s3_bucket) + s3_create_bucket(Banyan.global_aws_config(), s3_bucket) end # Create a cluster (at least initiate) and check that S3 bucket exists @@ -142,8 +142,8 @@ end dst_name = "data_from_s3" src_path = "s3://$s3_bucket/$dst_name" # Create a bucket and upload data - s3_create_bucket(Banyan.get_aws_config(), s3_bucket) - s3_put(Banyan.get_aws_config(), s3_bucket, dst_name, "some file contents") + s3_create_bucket(Banyan.global_aws_config(), s3_bucket) + s3_put(Banyan.global_aws_config(), s3_bucket, dst_name, "some file contents") end cluster_name = ENV["BANYAN_CLUSTER_NAME"] @@ -153,10 +153,10 @@ end @test ispath(S3Path("s3://$cluster_s3_bucket/$dst_name")) # Cleanup - s3_delete(Banyan.get_aws_config(), cluster_s3_bucket, dst_name) + s3_delete(Banyan.global_aws_config(), cluster_s3_bucket, dst_name) if src_type == "s3" - s3_delete(Banyan.get_aws_config(), s3_bucket, dst_name) - s3_delete_bucket(Banyan.get_aws_config(), s3_bucket) + s3_delete(Banyan.global_aws_config(), s3_bucket, dst_name) + s3_delete_bucket(Banyan.global_aws_config(), s3_bucket) end end @@ -178,6 +178,6 @@ end # Cleanup for f_name in readdir(src_path) - s3_delete(Banyan.get_aws_config(), cluster_s3_bucket, "$dst_name/$f_name") + s3_delete(Banyan.global_aws_config(), cluster_s3_bucket, "$dst_name/$f_name") end end \ No newline at end of file diff --git a/Banyan/test/sessions.jl b/Banyan/test/sessions.jl index 6c6e9bbc..738f9837 100644 --- a/Banyan/test/sessions.jl +++ b/Banyan/test/sessions.jl @@ -177,7 +177,7 @@ end 
println("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)") @test store_logs_in_s3 == isfile( S3Path("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)", - config=Banyan.get_aws_config()) + config=Banyan.global_aws_config()) ) end diff --git a/BanyanDataFrames/src/BanyanDataFrames.jl b/BanyanDataFrames/src/BanyanDataFrames.jl index 69ef44ec..fae29bce 100644 --- a/BanyanDataFrames/src/BanyanDataFrames.jl +++ b/BanyanDataFrames/src/BanyanDataFrames.jl @@ -20,7 +20,7 @@ using Arrow, export DataFrame, GroupedDataFrame # I/O -export read_csv, write_csv, read_parquet, write_parquet, read_arrow, write_arrow +export read_table, write_table, read_csv, write_csv, read_parquet, write_parquet, read_arrow, write_arrow # Dataframe properties export nrow, ncol, size, names, propertynames diff --git a/BanyanDataFrames/src/gdf.jl b/BanyanDataFrames/src/gdf.jl index 7bbe12cf..979f1b46 100644 --- a/BanyanDataFrames/src/gdf.jl +++ b/BanyanDataFrames/src/gdf.jl @@ -9,7 +9,7 @@ end Banyan.convert(::Type{Future}, gdf::GroupedDataFrame) = gdf.data Banyan.isview(gdf::GroupedDataFrame) = true Banyan.sample_memory_usage(gdf::DataFrames.GroupedDataFrame)::Int64 = - total_memory_usage(gdf) - total_memory_usage(parent(gdf)) + sample_memory_usage(gdf) - sample_memory_usage(parent(gdf)) Base.length(gdf::GroupedDataFrame) = compute(gdf.length) Base.size(gdf::GroupedDataFrame) = Tuple(length(gdf)) diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 17bca8f0..c16c9e33 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -102,7 +102,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: else parse(Int64, loc.src_parameters["nrows"]) end - total_nbytes = curr_metadata_invalid ? -1 : parse(Int64, loc.src_parameters["total_memory_usage"]) + total_nbytes = curr_metadata_invalid ? 
-1 : parse(Int64, loc.src_parameters["sample_memory_usage"]) exact_sample_needed = sampling_config.always_exact || total_nbytes <= max_num_bytes_exact # inv: (a) `meta_nrows_on_worker`, (b) `total_nrows_res`, and @@ -200,7 +200,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: meta_nrows_on_worker[i] = path_nrows push!(local_samples, path_sample) local_nrows += path_nrows - local_nbytes += ceil(Int64, total_memory_usage(path_sample) * path_sample_rate) + local_nbytes += ceil(Int64, sample_memory_usage(path_sample) * path_sample_rate) end total_nrows_res = reduce_and_sync_across(+, local_nrows) total_nbytes_res = reduce_and_sync_across(+, local_nbytes) @@ -264,7 +264,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: remote_sample_value_arrow = io.data # Construct Sample with the concatenated value, memory usage, and sample rate - remote_sample_value_memory_usage = total_memory_usage(remote_sample_value) + remote_sample_value_memory_usage = sample_memory_usage(remote_sample_value) total_nbytes_res = if exact_sample_needed remote_sample_value_memory_usage else @@ -278,7 +278,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: remote_sample_res::Sample = if exact_sample_needed # Technically we don't need to be passing in `total_bytes_res` # here but we do it because we are anyway computing it to - # return as the `total_memory_usage` for the `Location` and so + # return as the `sample_memory_usage` for the `Location` and so # we might as well avoid recomputing it in the `Sample` # constructors ExactSample(remote_sample_value_arrow, total_nbytes_res) @@ -307,7 +307,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # sample_rate # ) # remote_sample_value_nrows = nrow(cached_remote_sample_res.value) - # remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_res.value) + # remote_sample_value_nbytes = sample_memory_usage(cached_remote_sample_res.value) # if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE # @show remote_sample_value_nbytes remote_sample_value_nrows total_nrows_res # end @@ -323,7 +323,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # end cached_remote_sample_value = DataFrames.DataFrame(Arrow.Table(sample_path)) - remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_value) + remote_sample_value_nbytes = sample_memory_usage(cached_remote_sample_value) remote_sample_value_nrows = DataFrames.nrow(cached_remote_sample_value) total_nbytes_res = ceil(Int64, remote_sample_value_nbytes * total_nrows_res / remote_sample_value_nrows) cached_remote_sample_res = NOTHING_SAMPLE @@ -341,7 +341,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: src_params = Dict( "name" => "Remote", - "total_memory_usage" => string(total_nbytes), + "sample_memory_usage" => string(total_nbytes), # For dispatching the appropriate PF for this format "format" => format_string, # For constructing the `BanyanDataFrames.DataFrame`'s `nrows::Future` field diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index aa4f453c..f9fe0981 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -515,7 +515,7 @@ function WriteHelper(@nospecialize(format_value)) else Dict( "name" => "Remote", - "total_memory_usage" => "0", + "sample_memory_usage" => "0", "format" => format_string, "nrows" => "0", "path" => loc_params_path, 
@@ -524,7 +524,7 @@ function WriteHelper(@nospecialize(format_value)) end # Gather # of rows, # of bytes, empty sample, and actual sample - nbytes = part_res isa Empty ? 0 : Banyan.total_memory_usage(part_res) + nbytes = part_res isa Empty ? 0 : Banyan.sample_memory_usage(part_res) sampling_config = get_sampling_config(lp) sample_rate = sampling_config.rate sampled_part = (part_res isa Empty || is_disk) ? empty_df : Banyan.get_sample_from_data(part_res, sample_rate, nrows) @@ -549,13 +549,13 @@ function WriteHelper(@nospecialize(format_value)) # Update the # of bytes total_nrows::Int64 = parse(Int64, curr_src_parameters["nrows"]) - total_memory_usage::Int64 = parse(Int64, curr_src_parameters["total_memory_usage"]) + sample_memory_usage::Int64 = parse(Int64, curr_src_parameters["sample_memory_usage"]) empty_sample_found = false for (new_nrows::Int64, new_nbytes::Int64, empty_part, sampled_part) in gathered_data # Update the total # of rows and the total # of bytes total_nrows += sum(new_nrows) push!(curr_nrows, new_nrows) - total_memory_usage += new_nbytes + sample_memory_usage += new_nbytes # Get the empty sample if !empty_sample_found && !(empty_part isa Empty) @@ -564,9 +564,9 @@ function WriteHelper(@nospecialize(format_value)) end end curr_src_parameters["nrows"] = string(total_nrows) - curr_src_parameters["total_memory_usage"] = string(total_memory_usage) + curr_src_parameters["sample_memory_usage"] = string(sample_memory_usage) - if !is_disk && batch_idx == nbatches && total_memory_usage <= sampling_config.max_num_bytes_exact + if !is_disk && batch_idx == nbatches && sample_memory_usage <= sampling_config.max_num_bytes_exact # If the total # of rows turns out to be inexact then we can simply mark it as # stale so that it can be collected more efficiently later on # We should be able to quickly recompute a more useful sample later diff --git a/BanyanDataFrames/test/latency.jl b/BanyanDataFrames/test/latency.jl index 1ac18269..d0e50128 100644 --- a/BanyanDataFrames/test/latency.jl +++ b/BanyanDataFrames/test/latency.jl @@ -41,13 +41,13 @@ end function test_csv_from_s3_latency() use_session_for_testing(scheduling_config_name = "default scheduling", sample_rate=2048*4) do s3_bucket_name = get_cluster_s3_bucket_name() - if !s3_exists(Banyan.get_aws_config(), s3_bucket_name, "nyc_tripdata_small.csv") + if !s3_exists(Banyan.global_aws_config(), s3_bucket_name, "nyc_tripdata_small.csv") data_path = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv" offloaded(s3_bucket_name, data_path) do s3_bucket_name, data_path temp_path = Downloads.download(data_path) cp( Path(temp_path), - S3Path("s3://$s3_bucket_name/nyc_tripdata_small.csv", config=Banyan.get_aws_config()) + S3Path("s3://$s3_bucket_name/nyc_tripdata_small.csv", config=Banyan.global_aws_config()) ) end end diff --git a/BanyanDataFrames/test/runtests.jl b/BanyanDataFrames/test/runtests.jl index ff77fb8d..d81b3c59 100644 --- a/BanyanDataFrames/test/runtests.jl +++ b/BanyanDataFrames/test/runtests.jl @@ -74,7 +74,7 @@ function use_data(file_extension, remote_kind, single_file) ".$file_extension" testing_dataset_s3_path = S3Path( "s3://$(get_cluster_s3_bucket_name())/$testing_dataset_s3_name", - config = Banyan.get_aws_config(), + config = Banyan.global_aws_config(), ) # Create the file if not already created diff --git a/BanyanDataFrames/test/runtests_without_retest.jl b/BanyanDataFrames/test/runtests_without_retest.jl index 7772e755..3c57728a 100644 --- a/BanyanDataFrames/test/runtests_without_retest.jl +++ 
b/BanyanDataFrames/test/runtests_without_retest.jl @@ -112,11 +112,11 @@ end # path - path to write file to in bucket # download_path - either http(s) link to a file or a local Path indicating the source of the file function verify_file_in_s3(bucket, path, download_path) - if !s3_exists(Banyan.get_aws_config(), bucket, path) + if !s3_exists(Banyan.global_aws_config(), bucket, path) if typeof(download_path) == String && (startswith(download_path, "https://") || startswith(download_path, "http://")) - Downloads.download(download_path, S3Path("s3://$(bucket)/$(path)", config=Banyan.get_aws_config())) + Downloads.download(download_path, S3Path("s3://$(bucket)/$(path)", config=Banyan.global_aws_config())) else # upload local file - cp(Path(download_path), S3Path("s3://$(bucket)/$(path)", config=Banyan.get_aws_config())) + cp(Path(download_path), S3Path("s3://$(bucket)/$(path)", config=Banyan.global_aws_config())) end end end diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 88a85235..b20813c1 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -33,9 +33,9 @@ # Construct location if reusing != "nothing" - RemoteTableSource(src_name, invalidate_metadata = true, invalidate_sample = true) + RemoteTableSource(src_name) invalidate_location(src_name) - RemoteTableSource(src_name, metadata_invalid = true, sample_invalid = true) + RemoteTableSource(src_name) end if (reusing == "nothing" || reusing == "sample") invalidate_metadata(src_name) @@ -51,10 +51,10 @@ # Verify the location - @test remote_source.total_memory_usage > 0 + @test remote_source.sample_memory_usage > 0 @test !remote_source.metadata_invalid @test !remote_source.sample_invalid - @test remote_source.src_parameters["nrows"] == src_nrows + @test remote_source.src_parameters["nrows"] == string(src_nrows) # if contains(src_name, "dir") # @test length(remote_source.files) == 10 # for f in remote_source.files @@ -78,3 +78,64 @@ end end end + +@testset "Reading/writing $(shuffled ? 
"shuffle " : " ")$format data and sampling it with $scheduling_config and maximum # of bytes for exact sample" for scheduling_config in + [ + "default scheduling", + "parallelism encouraged", + "parallelism and batches encouraged", + ], + format in ["csv", "parquet"], + max_num_bytes in [0, Banyan.parse_bytes("100 GB")], + shuffled in [true, false] + + use_session_for_testing(scheduling_config_name = scheduling_config) do + use_basic_data() + + bucket = get_cluster_s3_bucket_name() + + invalidate_all_locations() + + p1 = "s3://$(bucket)/iris_large_$format.$format" + p2 = "s3://$(bucket)/iris_large_tmp_$format.$format" + + df = read_table(p1; metadata_invalid=true, invalidate_samples=true) + sample(df) + @show get_sample_rate(p1) + + configure_sampling(p2; sample_rate=5) + write_table(p2, df) + @test get_sample_rate(p2) == 5 + @test has_metadata(p2) + @test has_sample(p2) + invalidate_metadata(p2) + @test !has_metadata(p2) + @test has_sample(p2) + innvalidate_location(p2) + @test !has_metadata(p2) + @test !has_sample(p2) + + df2 = read_table(df2) + @show get_sample_rate(p2) + sample(df2) + @show get_sample_rate(p2) + df2 = read_table(df2; samples_invalid=true) + sample(df2) + configure_sampling(sample_rate=7, for_all_locations=true) + df2 = read_table(df2; metadata_invalid=true) + sample(df2) + @test get_sample_rate() == 5 + configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + @test get_sample_rate(p2) == 5 + df2 = read_table(df2) + @test get_sample_rate() == 7 + @test get_sample_rate() == 5 + df2 = read_table(df2; location_invalid=true) + sample(df2) + @test has_metadata(p2) + @test has_sample(p2) + @show get_sample_rate(p2) + configure_sampling(p2; always_exact=tru) + sample(df2) + end +end diff --git a/BanyanDataFrames/test/utils_data.jl b/BanyanDataFrames/test/utils_data.jl index 24cfadf6..0f8e0463 100644 --- a/BanyanDataFrames/test/utils_data.jl +++ b/BanyanDataFrames/test/utils_data.jl @@ -2,17 +2,17 @@ # path - path to write file to in bucket # download_path - either http(s) link to a file or a local Path indicating the source of the file function verify_file_in_s3(bucket, path, download_path) - if !s3_exists(Banyan.get_aws_config(), bucket, path) + if !s3_exists(Banyan.global_aws_config(), bucket, path) if typeof(download_path) == String && (startswith(download_path, "https://") || startswith(download_path, "http://")) Downloads.download( download_path, - S3Path("s3://$(bucket)/$(path)", config = Banyan.get_aws_config()), + S3Path("s3://$(bucket)/$(path)", config = Banyan.global_aws_config()), ) else # upload local file cp( Path(download_path), - S3Path("s3://$(bucket)/$(path)", config = Banyan.get_aws_config()), + S3Path("s3://$(bucket)/$(path)", config = Banyan.global_aws_config()), ) end end @@ -58,11 +58,11 @@ function setup_basic_tests(bucket_name=get_cluster_s3_bucket_name()) "iris_species_info.parquet", "iris_species_info.arrow", ] - bucket_contents = s3_list_keys(Banyan.get_aws_config(), bucket_name) + bucket_contents = s3_list_keys(Banyan.global_aws_config(), bucket_name) to_be_downloaded = [ iris_s3_path for iris_s3_path in iris_s3_paths if # TODO: Use the following when AWSS3.jl supports folders - # !s3_exists(Banyan.get_aws_config(), bucket_name, iris_s3_path) + # !s3_exists(Banyan.global_aws_config(), bucket_name, iris_s3_path) !(iris_s3_path in bucket_contents) ] if !isempty(to_be_downloaded) @@ -146,7 +146,7 @@ function setup_empty_tests(bucket_name=get_cluster_s3_bucket_name()) # Write empty dataframe empty_df = 
DataFrames.DataFrame() println("At start of setup_empty_tests") - if !ispath(S3Path("s3://$bucket_name/empty_df.csv", config = Banyan.get_aws_config())) + if !ispath(S3Path("s3://$bucket_name/empty_df.csv", config = Banyan.global_aws_config())) write_df_to_csv_to_s3( empty_df, "empty_df.csv", @@ -156,7 +156,7 @@ function setup_empty_tests(bucket_name=get_cluster_s3_bucket_name()) ) end println("After first setup_empty_tests") - if !ispath(S3Path("s3://$bucket_name/empty_df.arrow", config = Banyan.get_aws_config())) + if !ispath(S3Path("s3://$bucket_name/empty_df.arrow", config = Banyan.global_aws_config())) write_df_to_arrow_to_s3( empty_df, "empty_df.arrow", @@ -168,7 +168,7 @@ function setup_empty_tests(bucket_name=get_cluster_s3_bucket_name()) # Write empty dataframe with two columns empty_df2 = DataFrames.DataFrame(x = [], y = []) - if !ispath(S3Path("s3://$bucket_name/empty_df2.csv", config = Banyan.get_aws_config())) + if !ispath(S3Path("s3://$bucket_name/empty_df2.csv", config = Banyan.global_aws_config())) write_df_to_csv_to_s3( empty_df2, "empty_df2.csv", @@ -177,7 +177,7 @@ function setup_empty_tests(bucket_name=get_cluster_s3_bucket_name()) "empty_df2.csv", ) end - if !ispath(S3Path("s3://$bucket_name/empty_df2.arrow", config = Banyan.get_aws_config())) + if !ispath(S3Path("s3://$bucket_name/empty_df2.arrow", config = Banyan.global_aws_config())) write_df_to_arrow_to_s3( empty_df2, "empty_df2.arrow", @@ -197,13 +197,13 @@ end # idx = 0 # part_names = [] # while num_bytes_so_far < num_bytes -# dst_path = S3Path("s3://$bucket_name/nyc_tripdata_large.csv/part$idx.csv", config = Banyan.get_aws_config()) +# dst_path = S3Path("s3://$bucket_name/nyc_tripdata_large.csv/part$idx.csv", config = Banyan.global_aws_config()) # if Banyan.INVESTIGATING_SETUP_NYC_TAXI_STRESS_TEST # println("In while loop in setup_nyc_taxi_stress_test") # @show dst_path -# @show !s3_exists(Banyan.get_aws_config(), bucket_name, "nyc_tripdata_large.csv/part$idx.csv") +# @show !s3_exists(Banyan.global_aws_config(), bucket_name, "nyc_tripdata_large.csv/part$idx.csv") # end -# if !s3_exists(Banyan.get_aws_config(), bucket_name, "nyc_tripdata_large.csv/part$idx.csv") +# if !s3_exists(Banyan.global_aws_config(), bucket_name, "nyc_tripdata_large.csv/part$idx.csv") # if isnothing(nyc_trip_data_120_mb_path) # nyc_trip_data_120_mb_path = Path(download("https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv")) # end @@ -218,13 +218,13 @@ end # println("Outside while loop in setup_nyc_taxi_stress_test") # @show part_names # end -# for p in s3_list_keys(Banyan.get_aws_config(), bucket_name, "nyc_tripdata_large.csv/") +# for p in s3_list_keys(Banyan.global_aws_config(), bucket_name, "nyc_tripdata_large.csv/") # p_str = string(p) # if !any((endswith(p_str, part_name) for part_name in part_names)) # if Banyan.INVESTIGATING_SETUP_NYC_TAXI_STRESS_TEST # println("In final for loop in setup_nyc_taxi_stress_test with p=$p") # end -# s3_delete(Banyan.get_aws_config(), bucket_name, p) +# s3_delete(Banyan.global_aws_config(), bucket_name, p) # end # end # end @@ -277,7 +277,7 @@ function setup_stress_tests(bucket_name=get_cluster_s3_bucket_name()) for filetype in ["csv", "parquet", "arrow"] for ncopy = 1:n_repeats dst_path = "s3://$(bucket_name)/tripdata_large_$(filetype).$(filetype)/tripdata_$(month)_copy$(ncopy).$(filetype)" - dst_s3_path = S3Path(dst_path, config = Banyan.get_aws_config()) + dst_s3_path = S3Path(dst_path, config = Banyan.global_aws_config()) push!(dst_s3_paths, dst_s3_path) if !isfile(dst_s3_path) 
push!(dst_s3_paths_missing, dst_s3_path) @@ -309,7 +309,7 @@ function setup_stress_tests(bucket_name=get_cluster_s3_bucket_name()) cp( Path(get_local_path_tripdata(s3_path)), s3_path, - config = Banyan.get_aws_config(), + config = Banyan.global_aws_config(), ) end end @@ -319,10 +319,10 @@ end function cleanup_tests(bucket_name=get_cluster_s3_bucket_name()) # Delete all temporary test files that are prepended with "test-tmp__" @show bucket_name - for p in s3_list_keys(Banyan.get_aws_config(), bucket_name) + for p in s3_list_keys(Banyan.global_aws_config(), bucket_name) if contains(string(p), "test-tmp_") - # s3_path = S3Path("s3://$bucket_name/$p", config = Banyan.get_aws_config()) - rm(S3Path("s3://$bucket_name/$p", config = Banyan.get_aws_config()), recursive=true) + # s3_path = S3Path("s3://$bucket_name/$p", config = Banyan.global_aws_config()) + rm(S3Path("s3://$bucket_name/$p", config = Banyan.global_aws_config()), recursive=true) end end end diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 1dc78be0..38371070 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -145,7 +145,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig "subpath" => datasetpath, "eltype" => Banyan.size_to_str(dataszie), "size" => Banyan.type_to_str(dataeltype), - "total_memory_usage" => string(nbytes), + "sample_memory_usage" => string(nbytes), "format" => "hdf5" ) else diff --git a/BanyanHDF5/test/runtests.jl b/BanyanHDF5/test/runtests.jl index 8b155e77..56b16b58 100644 --- a/BanyanHDF5/test/runtests.jl +++ b/BanyanHDF5/test/runtests.jl @@ -101,7 +101,7 @@ function use_data(data_src = "S3") ), ) f_dst = joinpath( - S3Path("s3://$(get_cluster_s3_bucket_name())", config = Banyan.get_aws_config()), + S3Path("s3://$(get_cluster_s3_bucket_name())", config = Banyan.global_aws_config()), "fillval.h5", ) f = get_downloaded_path(f_dst, only_for_writing=true) @@ -115,7 +115,7 @@ function use_data(data_src = "S3") # rm(get_s3fs_path(joinpath(get_cluster_s3_bucket_name(), "fillval_copy.h5")), force=true) rm( joinpath( - S3Path("s3://$(get_cluster_s3_bucket_name())", config = Banyan.get_aws_config()), + S3Path("s3://$(get_cluster_s3_bucket_name())", config = Banyan.global_aws_config()), "fillval_copy.h5", ), force = true, diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 4d084f2a..783b858f 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -306,7 +306,7 @@ function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfi # regardless of whether we want to get the sample or the metadata _load_img = add_channelview ? _load_image_and_add_channelview : _load_image first_img = is_main ? (localpaths[1] |> _load_img |> _reshape_image) : nothing - exact_sample_needed = is_main ? ((total_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false + exact_sample_needed = is_main ? 
((sample_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false exact_sample_needed = sync_across(exact_sample_needed) need_to_parallelize = nimages >= 10 total_num_images_to_read_in = if curr_sample_invalid @@ -365,7 +365,7 @@ function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfi Dict{String,Any}( "name" => "Remote", "nimages" => string(nimages), - "total_memory_usage" => string(nbytes_res), # NOTE: We assume all files have same size + "sample_memory_usage" => string(nbytes_res), # NOTE: We assume all files have same size "size" => size_to_str(datasize_res), "eltype" => type_to_str(dataeltype_res), "add_channelview" => add_channelview ? "1" : "0", diff --git a/BanyanImages/test/locations.jl b/BanyanImages/test/locations.jl index 416ae0a6..274077ab 100644 --- a/BanyanImages/test/locations.jl +++ b/BanyanImages/test/locations.jl @@ -14,7 +14,7 @@ # s = RemoteImageSource(path; metadata_invalid=metadata_invalid, sample_invalid=sample_invalid) # @test s.src_parameters["nimages"] == 1 -# @test s.total_memory_usage == sizeof(ImageCore.RGB{N0f8}) * image_size # exact sample +# @test s.sample_memory_usage == sizeof(ImageCore.RGB{N0f8}) * image_size # exact sample # @test s.src_parameters["nbytes"] == sizeof(ImageCore.RGB{N0f8}) * image_size # @test s.src_parameters["ndims"] == 3 # @test s.src_parameters["size"] == (1, sqrt(image_size), sqrt(image_size)) diff --git a/BanyanImages/test/pfs.jl b/BanyanImages/test/pfs.jl index c53994e6..f41207f2 100644 --- a/BanyanImages/test/pfs.jl +++ b/BanyanImages/test/pfs.jl @@ -29,7 +29,7 @@ # # Construct files # if format == "directory" -# files = readdir(S3Path(path, config=Banyan.get_aws_config())) +# files = readdir(S3Path(path, config=Banyan.global_aws_config())) # datasize = add_channelview ? (nimages, 3, 100, 100) : (nimages, 100, 100) # empty_part_size = add_channelview ? 
(0, 3, 100, 100) : (0, 100, 100) # elseif format == "generator" diff --git a/BanyanImages/test/utils_data.jl b/BanyanImages/test/utils_data.jl index 863a19c5..0fbe8479 100644 --- a/BanyanImages/test/utils_data.jl +++ b/BanyanImages/test/utils_data.jl @@ -7,7 +7,7 @@ img_len = 100 function write_png_files_to_s3(bucket_name=get_cluster_s3_bucket_name(), nimages=1) global s3_dirs s3_dir_png = s3_dirs["png"] - if length(readdir(S3Path("s3://$bucket_name/$s3_dir_png/", config=Banyan.get_aws_config()))) < nimages + if length(readdir(S3Path("s3://$bucket_name/$s3_dir_png/", config=Banyan.global_aws_config()))) < nimages for i in 1:nimages println("Writing image $i to S3") rand_image = rand(ImageCore.RGB, img_len, img_len) @@ -20,7 +20,7 @@ end function write_jpg_files_to_s3(bucket_name=get_cluster_s3_bucket_name(), nimages=1) global s3_dirs s3_dir_jpg = s3_dirs["jpg"] - if length(readdir(S3Path("s3://$bucket_name/$s3_dir_jpg/", config=Banyan.get_aws_config()))) < nimages + if length(readdir(S3Path("s3://$bucket_name/$s3_dir_jpg/", config=Banyan.global_aws_config()))) < nimages for i in 1:nimages println("Writing image $i to S3") rand_image = rand(ImageCore.RGB, img_len, img_len) @@ -34,8 +34,8 @@ function cleanup_s3_test_files(bucket_name=get_cluster_s3_bucket_name()) global s3_dirs # Delete all files in test_images for (filetype, s3_dir) in s3_dirs - for p in s3_list_keys(Banyan.get_aws_config(), bucket_name, "$s3_dir") - rm(S3Path("s3://$bucket_name/$p", config=Banyan.get_aws_config()), recursive=true) + for p in s3_list_keys(Banyan.global_aws_config(), bucket_name, "$s3_dir") + rm(S3Path("s3://$bucket_name/$p", config=Banyan.global_aws_config()), recursive=true) end end end @@ -79,7 +79,7 @@ function get_test_path(src, format, filetype, nimages, bucket_name) if format == "path" "s3://$bucket_name/$s3_dir/test_image_1.$filetype" elseif format == "directory" || format == "generator" - p = S3Path("s3://$bucket_name/earthdata_jpg_$nimages/", config=Banyan.get_aws_config()) + p = S3Path("s3://$bucket_name/earthdata_jpg_$nimages/", config=Banyan.global_aws_config()) if !isdir(p) mkdir(p) end diff --git a/BanyanONNXRunTime/src/locations.jl b/BanyanONNXRunTime/src/locations.jl index 812ad8f1..35968586 100644 --- a/BanyanONNXRunTime/src/locations.jl +++ b/BanyanONNXRunTime/src/locations.jl @@ -10,7 +10,7 @@ function RemoteONNXSource(remotepath)::Location if p_exists pp = get_downloaded_path(p) model = ONNXRunTime.load_inference(pp) - nbytes = Banyan.total_memory_usage(model) + nbytes = Banyan.sample_memory_usage(model) destroy_downloaded_path(pp) end From f6bb46f860d9fb08be84fadb1a096ffbe9328e82 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 9 Aug 2022 07:55:30 -0700 Subject: [PATCH 11/25] Add BanyanONNXRunTime tests for sampling --- BanyanONNXRunTime/test/onnxruntime.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/BanyanONNXRunTime/test/onnxruntime.jl b/BanyanONNXRunTime/test/onnxruntime.jl index 829cedba..876482d8 100644 --- a/BanyanONNXRunTime/test/onnxruntime.jl +++ b/BanyanONNXRunTime/test/onnxruntime.jl @@ -17,6 +17,13 @@ @test res_size == (120, 2, 3) all_incremented = all(res .== 2) @test all_incremented + + model_sample = sample(model) + res_sample = model_sample(Dict("input" => sample(data)))["output"] + res_size = size(res_sample) + @test res_size == (120, 2, 3) + all_incremented = all(res_sample .== 2) + @test all_incremented end end From 1a3832dc98f0f30c5d1c63e1c26abfb4d78b49d2 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 9 Aug 2022 16:02:24 -0700 Subject: 
[PATCH 12/25] Fix some bugs and update Arrow version --- Banyan/Project.toml | 10 +-- Banyan/src/Banyan.jl | 8 ++- Banyan/src/location.jl | 26 +++++--- Banyan/src/locations.jl | 26 ++++---- Banyan/src/queues.jl | 8 +-- Banyan/src/requests.jl | 8 +++ Banyan/src/sample.jl | 2 +- Banyan/src/samples.jl | 2 +- Banyan/src/sessions.jl | 6 +- Banyan/src/utils.jl | 6 +- Banyan/test/Project.toml | 4 +- BanyanDataFrames/Project.toml | 2 +- BanyanDataFrames/src/df.jl | 2 +- BanyanDataFrames/test/sample_collection.jl | 18 +++--- BanyanDataFrames/test/utils_data.jl | 1 - BanyanHDF5/src/hdf5.jl | 2 +- BanyanHDF5/src/locations.jl | 2 +- BanyanHDF5/test/hdf5.jl | 73 ++++++++++++++++++++-- BanyanImages/Project.toml | 2 +- BanyanImages/test/jpg.jl | 64 ++++++++++++++++++- 20 files changed, 209 insertions(+), 63 deletions(-) diff --git a/Banyan/Project.toml b/Banyan/Project.toml index 02ad86b3..ef6505fd 100644 --- a/Banyan/Project.toml +++ b/Banyan/Project.toml @@ -7,7 +7,6 @@ version = "0.4.1" AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" -AWSSQS = "6e80b5ca-5733-51f9-999e-c18680912812" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" @@ -16,7 +15,6 @@ Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" -IniFile = "83e8ac13-25f8-5344-8a64-a9f2b223428f" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LibGit2 = "76f85450-5226-5b5a-8eaa-529ad045b433" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" @@ -31,16 +29,14 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53" [compat] -AWSCore = "0.6" -AWSS3 = "0.7" -AWSSQS = "0.6" +AWS = "1" +AWSS3 = "0.8" Arrow = "2" DataStructures = "0.18" -Downloads = "^1.4" +Downloads = "1.4" FileIO = "1.9.1" FilePathsBase = "^0.9.15" HTTP = "^0.9.17" -IniFile = "0.5" JSON = "0.21" MPI = "^0.19.0" MethodAnalysis = "0.4" diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 640175cf..4e31eeda 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -37,10 +37,12 @@ using Base64, TOML using AWS -using AWS.AWSServices: s3 +AWS.DEFAULT_BACKEND[] = AWS.DownloadsBackend() +s3 = set_features(AWS.AWSServices.s3; use_response_type=true) +using AWS.AWSExceptions using AWS: @service -@service S3 -@service SQS +@service S3 use_response_type = true +@service SQS use_response_type = true using AWSS3 global BANYAN_API_ENDPOINT diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 85a79dc7..07c665a7 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -44,7 +44,7 @@ end global TABLE_FORMATS = ["csv", "parquet", "arrow"] -function get_location_path_with_format(p::String, kwargs...)::LocationPath +function get_location_path_with_format(p::String; kwargs...)::LocationPath if isempty(p) return NO_LOCATION_PATH end @@ -88,7 +88,7 @@ function set_session_sampling_configs(d::Dict{SessionId,Dict{LocationPath,Sampli session_sampling_configs = d end -get_sampling_config(path="", kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) +get_sampling_config(path=""; kwargs...) 
= get_sampling_config(get_location_path_with_format(path; kwargs...)) function get_sampling_configs() global session_sampling_configs session_sampling_configs[_get_session_id_no_error()] @@ -102,8 +102,13 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = get_sample_rate(p::String=""; kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) -parse_sample_rate(object_key) = - parse(Int64, object_key[(findlast("_", object_key).start+1):end]) +function parse_sample_rate(object_key) + lastpos = findlast("_", object_key) + if isnothing(lastpos) + error("Object name \"$object_key\" doesn't contain a sample rate") + end + parse(Int64, object_key[(lastpos.start+1):end]) +end function get_sample_rate(l_path::LocationPath) # Get the desired sample rate desired_sample_rate = get_sampling_config(l_path).rate @@ -182,6 +187,8 @@ struct AWSExceptionInfo end function get_location_source(lp::LocationPath)::Tuple{Location,String,String} + global s3 + # This checks local cache and S3 cache for sample and metadata files. # It then returns a Location object (with a null sample) and the local file names # to read/write the metadata and sample from/to. @@ -246,7 +253,8 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} found_local_samples = Tuple{String,Int64}[] found_local_sample_rate_diffs = Int64[] samples_local_dir = joinpath(homedir(), ".banyan", "samples") - for local_sample_path in readdir(samples_local_dir, join=true) + local_sample_paths = isdir(samples_local_dir) ? readdir(samples_local_dir, join=true) : String[] + for local_sample_path in local_sample_paths if startswith(local_sample_path, sample_path_prefix) local_sample_rate = parse_sample_rate(object_key) diff_sample_rate = abs(local_sample_rate - desired_sample_rate) @@ -330,7 +338,11 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} res_location.sample_invalid = isempty(final_local_sample_path) ( res_location, - metaata_local_path, - isempty(final_local_sample_path) ? final_local_sample_path : "sample_path_prefix$desired_sample_rate" + metadata_local_path, + if !isempty(final_local_sample_path) + final_local_sample_path + else + joinpath(samples_local_dir, "$sample_path_prefix$desired_sample_rate") + end ) end \ No newline at end of file diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 15a7e442..565e45ac 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -11,7 +11,7 @@ Location(name::String, parameters::LocationParameters, sample_memory_usage::Int6 Base.isnothing(l::Location) = isnothing(l.sample) -LocationSource(name::String, parameters::LocationParameters, sample_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = +LocationSource(name::String, parameters::Union{Dict{String,Any},Dict{String,String}}, sample_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = Location(name, "None", parameters, LocationParameters(), sample_memory_usage, sample, false, false) LocationDestination( @@ -350,11 +350,11 @@ function invalidate_location(p; kwargs...) invalidate_metadata(p; kwargs...) invalidate_samples(p; kwargs...) end -function invalidate_all_locations(p; kwargs...) 
+function invalidate_all_locations() for subdir in ["samples", "metadata"] local_dir = joinpath(homedir(), ".banyan", subdir) - if isdir(samples_local_dir) - rm(local_dir; force=true, recrusive=true) + if isdir(local_dir) + rm(local_dir; force=true, recursive=true) end end @@ -374,14 +374,16 @@ function invalidate_all_locations(p; kwargs...) for d in banyan_samples_objects push!(objects_to_delete, Dict("Key" => d["Key"])) end - try - S3.delete_objects( - banyan_samples_bucket_name(), - Dict("Objects" => objects_to_delete) - ) - catch e - if is_debug_on() - show(e) + if !isempty(objects_to_delete) + try + S3.delete_objects( + banyan_samples_bucket_name(), + Dict("Objects" => objects_to_delete) + ) + catch e + if is_debug_on() + show(e) + end end end end diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index 412aef57..8ced21e5 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -2,9 +2,9 @@ # GET QUEUE URL # ################# -scatter_queue_url()::Dict{Symbol,Any} = get_session().scatter_queue_url -gather_queue_url()::Dict{Symbol,Any} = get_session().gather_queue_url -execution_queue_url()::Dict{Symbol,Any} = get_session().execution_queue_url +scatter_queue_url()::String = get_session().scatter_queue_url +gather_queue_url()::String = get_session().gather_queue_url +execution_queue_url()::String = get_session().execution_queue_url ################### # RECEIVE MESSAGE # @@ -112,8 +112,8 @@ end function sqs_send_message(queue_url, message) generated_message_id = generate_message_id() SQS.send_message( - queue_url, message, + queue_url, Dict( "MessageGroupId" => "1", "MessageDeduplicationId" => generated_message_id diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 64a44981..a8d28ce9 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -287,6 +287,10 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n value_id = message["value_id"]::ValueId num_chunks = message["num_chunks"]::Int64 num_remaining_chunks = num_chunks - 1 + + if is_debug_on() + printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") + end whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) @@ -704,6 +708,10 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) value_id = message["value_id"]::ValueId num_chunks = message["num_chunks"]::Int64 num_remaining_chunks = num_chunks - 1 + + if is_debug_on() + printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") + end whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index 7db1bfdc..81a5d6b1 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -27,7 +27,7 @@ const NOTHING_SAMPLE = Sample(nothing, Int64(-1)) Base.isnothing(s::Sample) = s.rate == -1 -struct SamplingConfig +mutable struct SamplingConfig rate::Int64 always_exact::Bool max_num_bytes_exact::Int64 diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 2e6e6d39..47d3a8fd 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -13,7 +13,7 @@ function configure_sampling( sc = get_sampling_config(path; kwargs...) nsc = SamplingConfig( - (!isnothing(sample_rate) && !default) ? rate : sc.rate, + (!isnothing(sample_rate) && !default) ? sample_rate : sc.rate, (!isnothing(always_exact) && !default) ? always_exact : sc.always_exact, (!isnothing(max_num_bytes_exact) && !default) ? 
max_num_bytes_exact : sc.max_num_bytes_exact, (!isnothing(force_new_sample_rate) && !default) ? force_new_sample_rate : sc.force_new_sample_rate, diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index d0361657..dd6cd25d 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -549,7 +549,7 @@ function get_session_status(session_id::String=get_session_id(); kwargs...)::Str session_status end -function _wait_for_session(session_id::SessionId=get_session_id(), kwargs...) +function _wait_for_session(session_id::SessionId=get_session_id(); kwargs...) sessions_dict = get_sessions_dict() session_status = get_session_status(session_id; kwargs...) p = ProgressUnknown("Preparing session with ID $session_id", spinner=true) @@ -580,7 +580,7 @@ function _wait_for_session(session_id::SessionId=get_session_id(), kwargs...) end end -function wait_for_session(session_id::SessionId=get_session_id(), kwargs...) +function wait_for_session(session_id::SessionId=get_session_id(); kwargs...) sessions_dict = get_sessions_dict() is_session_ready = if haskey(sessions_dict, session_id) session_info::Session = sessions_dict[session_id] @@ -592,7 +592,7 @@ function wait_for_session(session_id::SessionId=get_session_id(), kwargs...) false end if !is_session_ready - _wait_for_session(session_id, kwargs...) + _wait_for_session(session_id; kwargs...) end end diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 6daafc17..4001469a 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -123,7 +123,7 @@ end get_banyanconfig_path()::String = joinpath(homedir(), ".banyan", "banyanconfig.toml") -configure(; user_id=nothing, api_key=nothing, ec2_key_pair_name=nothing, banyanconfig_path=nothing) = +configure(; user_id=nothing, api_key=nothing, ec2_key_pair_name=nothing, banyanconfig_path=nothing, kwargs...) = configure( isnothing(user_id) ? "" : user_id, isnothing(api_key) ? 
"" : api_key, @@ -200,7 +200,7 @@ end # Getting organization IDs -organization_ids = Dict{String,String} +organization_ids = Dict{String,String}() function get_organization_id() global organization_ids global sessions @@ -209,7 +209,7 @@ function get_organization_id() if haskey(organization_ids, user_id) organization_ids[user_id] elseif haskey(sessions, session_id) - sessions[session_id].organization_ids + sessions[session_id].organization_id else organization_id = send_request_get_response(:describe_users, Dict())["organization_id"] organization_ids[user_id] = organization_id diff --git a/Banyan/test/Project.toml b/Banyan/test/Project.toml index dd440603..61c14273 100644 --- a/Banyan/test/Project.toml +++ b/Banyan/test/Project.toml @@ -10,7 +10,6 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" -IniFile = "83e8ac13-25f8-5344-8a64-a9f2b223428f" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Parquet = "626c502c-15b0-58ad-a749-f091afb673ae" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" @@ -23,13 +22,12 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53" [compat] -Arrow = "1.5.0" +Arrow = "2" CSV = "0.9.5" DataFrames = "1" Downloads = "1.4" FileIO = "1.9.1" FilePathsBase = "^0.9.15" -IniFile = "0.5.0" JSON = "0.21.1" Parquet = "0.8.3" ReTest = "0.3.2" diff --git a/BanyanDataFrames/Project.toml b/BanyanDataFrames/Project.toml index 64dcbab1..0de977b7 100644 --- a/BanyanDataFrames/Project.toml +++ b/BanyanDataFrames/Project.toml @@ -21,7 +21,7 @@ Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] -Arrow = "^1.6" +Arrow = "2" Banyan = "0.4.1" BanyanArrays = "0.4.1" DataFrames = "1" diff --git a/BanyanDataFrames/src/df.jl b/BanyanDataFrames/src/df.jl index b51eb4f0..3c604963 100644 --- a/BanyanDataFrames/src/df.jl +++ b/BanyanDataFrames/src/df.jl @@ -50,7 +50,7 @@ Base.propertynames(df::DataFrame) = propertynames(sample(df)::DataFrames.DataFra function read_table(path::String; kwargs...) @nospecialize invalidate(path; kwargs...) - df_loc = RemoteTableSource(path; kwargs...) + df_loc = RemoteTableSource(path) df_loc.src_name == "Remote" || error("$path does not exist") invalidate(path; after=true, kwargs...) 
df_loc_nrows::Int64 = parse(Int64, df_loc.src_parameters["nrows"]) diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index b20813c1..3b772ff8 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -115,27 +115,29 @@ end @test !has_metadata(p2) @test !has_sample(p2) - df2 = read_table(df2) + df2 = read_table(p2) @show get_sample_rate(p2) sample(df2) @show get_sample_rate(p2) - df2 = read_table(df2; samples_invalid=true) + df2 = read_table(p2; samples_invalid=true) sample(df2) configure_sampling(sample_rate=7, for_all_locations=true) - df2 = read_table(df2; metadata_invalid=true) + df2 = read_table(p2; metadata_invalid=true) sample(df2) - @test get_sample_rate() == 5 + @test get_sample_rate(p2) == 5 + @test get_sample_rate() == 7 configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) @test get_sample_rate(p2) == 5 - df2 = read_table(df2) @test get_sample_rate() == 7 - @test get_sample_rate() == 5 - df2 = read_table(df2; location_invalid=true) + df2 = read_table(p2) + @test get_sample_rate(p2) == 7 + @test get_sample_rate() == 7 + df2 = read_table(p2; location_invalid=true) sample(df2) @test has_metadata(p2) @test has_sample(p2) @show get_sample_rate(p2) - configure_sampling(p2; always_exact=tru) + configure_sampling(p2; always_exact=true) sample(df2) end end diff --git a/BanyanDataFrames/test/utils_data.jl b/BanyanDataFrames/test/utils_data.jl index 0f8e0463..4bae4562 100644 --- a/BanyanDataFrames/test/utils_data.jl +++ b/BanyanDataFrames/test/utils_data.jl @@ -318,7 +318,6 @@ end function cleanup_tests(bucket_name=get_cluster_s3_bucket_name()) # Delete all temporary test files that are prepended with "test-tmp__" - @show bucket_name for p in s3_list_keys(Banyan.global_aws_config(), bucket_name) if contains(string(p), "test-tmp_") # s3_path = S3Path("s3://$bucket_name/$p", config = Banyan.global_aws_config()) diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index 3bef15de..ecd529b4 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -1,6 +1,6 @@ function read_hdf5(path; kwargs...) invalidate(path; kwargs...) - A_loc = RemoteHDF5Source(path; kwargs...) + A_loc = RemoteHDF5Source(path) A_loc.src_name == "Remote" || error("$path does not exist") invalidate(path; after=true, kwargs...) A = Future(datatype="Array", source=A_loc) diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 38371070..82142481 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -138,7 +138,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig if is_main # Construct parameters for Location src_params = if curr_metadata_invalid - Dict{String,String}( + Dict{String,Any}( "name" => "Remote", "path_and_subpath" => path_and_subpath, "path" => remotepath, diff --git a/BanyanHDF5/test/hdf5.jl b/BanyanHDF5/test/hdf5.jl index 41b6d969..8080db4a 100644 --- a/BanyanHDF5/test/hdf5.jl +++ b/BanyanHDF5/test/hdf5.jl @@ -6,7 +6,7 @@ src in ["Internet", "S3"] use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do use_data() - set_max_exact_sample_length(128) + configure_sampling(max_num_bytes_exact=0) for _ in 1:2 src_is_s3 = src == "S3" @@ -27,7 +27,72 @@ src in ["Internet", "S3"] @test x_sum_collect == (src_is_s3 ? 
12840 : 32100000)
         end
 
-        set_max_exact_sample_length(2048)
+        configure_sampling(default=true)
     end
 end
 
+
+# TODO: Add tests here modeled after BDF.jl
+
+@testset "Reading and sampling HDF5 in $src with $scheduling_config with max_num_bytes_exact=$max_num_bytes and shuffled=$shuffled" for scheduling_config in [
+    "default scheduling",
+    "parallelism encouraged",
+    "parallelism and batches encouraged",
+],
+src in ["Internet", "S3"],
+max_num_bytes in [0, Banyan.parse_bytes("100 GB")],
+shuffled in [true, false]
+    get_organization_id()
+    use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do
+        invalidate_all_locations()
+        use_data()
+        configure_sampling(max_num_bytes_exact=max_num_bytes, assume_shuffled=shuffled)
+
+        p = if src == "S3"
+            joinpath("s3://", get_cluster_s3_bucket_name(), "fillval.h5/DS1")
+        else
+            joinpath("https://github.com/banyan-team/banyan-julia/raw/v0.1.1/BanyanArrays/test/res", "fillval.h5/DS1")
+        end
+
+        x = read_hdf5(p)
+        sample(x)
+        @show get_sample_rate(x)
+
+        configure_sampling(p; sample_rate=5)
+        x = read_hdf5(p)
+        @test get_sample_rate(p) == 5
+        @test has_metadata(p)
+        @test has_sample(p)
+        invalidate_metadata(p)
+        @test !has_metadata(p)
+        @test has_sample(p)
+        invalidate_location(p)
+        @test !has_metadata(p)
+        @test !has_sample(p)
+
+        x = read_hdf5(p)
+        @show get_sample_rate(p)
+        sample(x)
+        @show get_sample_rate(p)
+        x = read_hdf5(p; samples_invalid=true)
+        sample(x)
+        configure_sampling(sample_rate=7, for_all_locations=true)
+        x = read_hdf5(p; metadata_invalid=true)
+        sample(x)
+        @test get_sample_rate(p) == 5
+        @test get_sample_rate() == 7
+        configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true)
+        @test get_sample_rate(p) == 5
+        @test get_sample_rate() == 7
+        x = read_hdf5(p)
+        @test get_sample_rate(p) == 7
+        @test get_sample_rate() == 7
+        x = read_hdf5(p; location_invalid=true)
+        sample(x)
+        @test has_metadata(p)
+        @test has_sample(p)
+        @show get_sample_rate(p)
+        configure_sampling(p; always_exact=true)
+        sample(x)
+    end
+end
@@ -41,7 +106,7 @@ end
     use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do
         use_data(src)
 
-        set_max_exact_sample_length(128)
+        configure_sampling(max_num_bytes_exact=0)
 
         # Determine where to read from
@@ -169,6 +234,6 @@ end
 #     end
 # end
 
-        set_max_exact_sample_length(2048)
+        configure_sampling(default=true)
     end
 end
diff --git a/BanyanImages/Project.toml b/BanyanImages/Project.toml
index 1fb3dc44..cf7cbabd 100644
--- a/BanyanImages/Project.toml
+++ b/BanyanImages/Project.toml
@@ -17,7 +17,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-Arrow = "^1.6"
+Arrow = "2"
 Banyan = "0.4.1"
 BanyanArrays = "0.4.1"
 FileIO = "1.9.1"
diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl
index e8a9a2f4..bdf0ae26 100644
--- a/BanyanImages/test/jpg.jl
+++ b/BanyanImages/test/jpg.jl
@@ -54,7 +54,7 @@ invalid_bool_to_str(metadata_invalid) = metadata_invalid ? "invalid" : "valid"
     add_channelview in [true, false],
     metadata_invalid in [true, false],
     sample_invalid in [true, false],
-    nimages in [75, 5]
+    nimages in [75, 50]
 
     # TODO: Test exact sample collection and also replicated with batch image computation
     use_session_for_testing(sample_rate = 75) do
         bucket_name = get_cluster_s3_bucket_name()
@@ -79,6 +79,68 @@ invalid_bool_to_str(metadata_invalid) ? "invalid" : "valid"
     end
 end
+@testset "Reading and sampling $nimages JPG images on $loc with $format and add_channelview=$add_channelview, max_num_bytes=$max_num_bytes, shuffled=$shuffled" for
+    (loc, format) in [
+        ("Internet", "generator"),
+        ("S3", "generator"),
+        ("S3", "directory")
+    ],
+    max_num_bytes in [0, Banyan.parse_bytes("100 GB")],
+    shuffled in [true, false],
+    nimages in [1, 50],
+    add_channelview in [true, false]
+    get_organization_id()
+    use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do
+        bucket_name = get_cluster_s3_bucket_name()
+        invalidate_all_locations()
+        configure_sampling(max_num_bytes_exact=max_num_bytes, assume_shuffled=shuffled)
+
+        p = get_test_path(loc, "generator", "jpg", nimages, bucket_name)
+
+        x = read_jpg(p; add_channelview=add_channelview)
+        sample(x)
+        @show get_sample_rate(x)
+
+        # TODO: Ensure that this triggers parallel cluster<->client data transfer
+        configure_sampling(p; sample_rate=20)
+        x = read_jpg(p; add_channelview=add_channelview)
+        @test get_sample_rate(p) == 20
+        @test has_metadata(p)
+        @test has_sample(p)
+        invalidate_metadata(p)
+        @test !has_metadata(p)
+        @test has_sample(p)
+        invalidate_location(p)
+        @test !has_metadata(p)
+        @test !has_sample(p)
+
+        x = read_jpg(p; add_channelview=add_channelview)
+        @show get_sample_rate(p)
+        sample(x)
+        @show get_sample_rate(p)
+        x = read_jpg(p; add_channelview=add_channelview, samples_invalid=true)
+        sample(x)
+        configure_sampling(sample_rate=75, for_all_locations=true)
+        x = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true)
+        sample(x)
+        @test get_sample_rate(p) == 50
+        @test get_sample_rate() == 75
+        configure_sampling(sample_rate=75, force_new_sample_rate=true, for_all_locations=true)
+        @test get_sample_rate(p) == 50
+        @test get_sample_rate() == 75
+        x = read_jpg(p; add_channelview=add_channelview)
+        @test get_sample_rate(p) == 75
+        @test get_sample_rate() == 75
+        x = read_jpg(p; add_channelview=add_channelview, location_invalid=true)
+        sample(x)
+        @test has_metadata(p)
+        @test has_sample(p)
+        @show get_sample_rate(p)
+        configure_sampling(p; always_exact=true)
+        sample(x)
+    end
+end
+
 # @testset "Reading/writing JPG $src through $format" for (src, format) in
 # ]
 #     # TODO: read

From 47a48234127f67c6c9523f8485113d53dadee7d9 Mon Sep 17 00:00:00 2001
From: Caleb Winston
Date: Tue, 9 Aug 2022 19:24:01 -0700
Subject: [PATCH 13/25] Fix send_to_client

---
 Banyan/src/queues.jl   | 17 ++++++++++++++---
 Banyan/src/requests.jl |  6 ++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl
index 8ced21e5..f565277c 100644
--- a/Banyan/src/queues.jl
+++ b/Banyan/src/queues.jl
@@ -124,6 +124,7 @@ end
 function send_to_client(value_id::ValueId, value, worker_memory_used = 0)
     MAX_MESSAGE_LENGTH = 220_000
     message = to_jl_string(value)::String
+    generated_message_id = generate_message_id()
 
     # Break the message down into chunk ranges
     nmessages = 0
@@ -140,7 +141,7 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0)
             message_i += MAX_MESSAGE_LENGTH
             message_length -= MAX_MESSAGE_LENGTH
         end
-        push!(message_ranges, starti:message_i)
+        push!(message_ranges, starti:(message_i-1))
         nmessages += 1
         if is_last_message
             break
@@ -150,6 +151,7 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0)
     # Launch asynchronous threads to send SQS messages
     gather_q_url = gather_queue_url()
     num_chunks = length(message_ranges)
+    @show num_chunks
     if num_chunks > 1
         @sync for i = 
1:message_ranges @async begin @@ -165,8 +167,13 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) SQS.send_message( msg_json, gather_q_url, - Dict("MessageGroupId" => string(i)) + Dict( + "MessageGroupId" => string(i), + "MessageDeduplicationId" => generated_message_id * string(i) + ) ) + @show msg + @show i end end else @@ -179,11 +186,15 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) "chunk_idx" => i, "num_chunks" => num_chunks ) + @show msg msg_json = JSON.json(msg) SQS.send_message( msg_json, gather_q_url, - Dict("MessageGroupId" => string(i)) + Dict( + "MessageGroupId" => string(i), + "MessageDeduplicationId" => generated_message_id * string(i) + ) ) end end diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index a8d28ce9..07d50e60 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -291,6 +291,8 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n if is_debug_on() printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end + + @show num_chunks whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) @@ -299,6 +301,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n @async begin partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) chunk_idx = partial_message["chunk_idx"] + @show chunk_idx partial_messages[chunk_idx] = message["contents"] end end @@ -713,6 +716,8 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end + @show num_chunks + whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) partial_messages[message["chunk_idx"]] = message["contents"] @@ -720,6 +725,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) @async begin partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) chunk_idx = partial_message["chunk_idx"] + @show chunk_idx partial_messages[chunk_idx] = message["contents"] end end From 91245c527292d8ddb35987779f5ed03439007faf Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Wed, 10 Aug 2022 09:53:06 -0700 Subject: [PATCH 14/25] Fix SamplingConfig serialization --- Banyan/src/Banyan.jl | 1 + Banyan/src/location.jl | 4 ++-- Banyan/src/requests.jl | 4 ++-- Banyan/src/session.jl | 6 +++--- Banyan/src/utils.jl | 18 ++++++++++-------- BanyanDataFrames/Project.toml | 1 + BanyanDataFrames/src/BanyanDataFrames.jl | 1 + BanyanDataFrames/src/locations.jl | 5 +++-- BanyanDataFrames/test/sample_collection.jl | 4 ++-- 9 files changed, 25 insertions(+), 19 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 4e31eeda..388c0083 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -100,6 +100,7 @@ export has_separate_metadata, get_sample, get_metadata, get_sample_and_metadata export LocationPath, SamplingConfig export has_metadata, has_sample, get_sample_rate, configure_sampling export type_to_str, str_to_type +export banyan_metadata_bucket_name, banyan_samples_bucket_name, get_metadata_path, get_sample_path_prefix, get_sample_path # Serialization export from_jl_string, to_jl_string diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 07c665a7..2b2f7f61 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -83,9 +83,9 @@ const NO_LOCATION_PATH = LocationPath("", "", "") const 
DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("32 MB"), false, true) session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) -function set_session_sampling_configs(d::Dict{SessionId,Dict{LocationPath,SamplingConfig}}) +function set_sampling_configs(d::Dict{LocationPath,SamplingConfig}) global session_sampling_configs - session_sampling_configs = d + session_sampling_configs[_get_session_id_no_error()] = d end get_sampling_config(path=""; kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 07d50e60..6da8d9c4 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -713,7 +713,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) num_remaining_chunks = num_chunks - 1 if is_debug_on() - printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") + println("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end @show num_chunks @@ -742,7 +742,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) # call to `send_evaluation` end - error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, contents, error_for_main_stuck, error_for_main_stuck_time) + error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, whole_message_contents, error_for_main_stuck, error_for_main_stuck_time) elseif (message_type == "EVALUATION_END") if message["end"]::Bool == true return stored_message diff --git a/Banyan/src/session.jl b/Banyan/src/session.jl index cbec24ec..9e781299 100644 --- a/Banyan/src/session.jl +++ b/Banyan/src/session.jl @@ -61,13 +61,13 @@ mutable struct Session end function sampling_configs_to_jl(sampling_configs::Dict{LocationPath,SamplingConfig}) - res = Tuple{Tuple{String,String,String},Tuple{Int64,Bool,Int64,Bool}}[] + res = Tuple{Tuple{String,String,String},Tuple{Int64,Bool,Int64,Bool,Bool}}[] for (l::LocationPath, s::SamplingConfig) in sampling_configs push!( res, ( (l.original_path, l.format_name, l.format_version), - (s.rate, s.always_exact, s.max_num_bytes_exact, s.force_new_sample_rate), + (s.rate, s.always_exact, s.max_num_bytes_exact, s.force_new_sample_rate, s.assume_shuffled), ), ) end @@ -77,7 +77,7 @@ end function sampling_configs_from_jl(sampling_configs) res = Dict{LocationPath,SamplingConfig}() for (l, s) in sampling_configs - res[LocationPath(l[1], l[2], l[3])] = SamplingConfig(s[1], s[2], s[3], s[4]) + res[LocationPath(l[1], l[2], l[3])] = SamplingConfig(s[1], s[2], s[3], s[4], s[5]) end res end \ No newline at end of file diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 4001469a..cec3e57c 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -165,7 +165,7 @@ function configure(user_id, api_key, ec2_key_pair_name, banyanconfig_path) end # Check banyanconfig file - banyan_config_has_info = !(isempty(banyan_config) || isempty(banyan_config)) + banyan_config_has_info = !isnothing(banyan_config) && !isempty(banyan_config) if isempty(user_id) && banyan_config_has_info && haskey(banyan_config, "banyan") && haskey(banyan_config["banyan"], "user_id") user_id = banyan_config["banyan"]["user_id"] end @@ -204,16 +204,18 @@ organization_ids = Dict{String,String}() function get_organization_id() global organization_ids global sessions - user_id = configure()["banyan"]["user_id"] session_id = _get_session_id_no_error() - 
if haskey(organization_ids, user_id) - organization_ids[user_id] - elseif haskey(sessions, session_id) + if haskey(sessions, session_id) sessions[session_id].organization_id else - organization_id = send_request_get_response(:describe_users, Dict())["organization_id"] - organization_ids[user_id] = organization_id - organization_id + user_id = configure()["banyan"]["user_id"] + if haskey(organization_ids, user_id) + organization_ids[user_id] + else + organization_id = send_request_get_response(:describe_users, Dict())["organization_id"] + organization_ids[user_id] = organization_id + organization_id + end end end diff --git a/BanyanDataFrames/Project.toml b/BanyanDataFrames/Project.toml index 0de977b7..963f2ecd 100644 --- a/BanyanDataFrames/Project.toml +++ b/BanyanDataFrames/Project.toml @@ -7,6 +7,7 @@ version = "0.4.1" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Banyan = "706d138b-e922-45b9-a636-baf8ae0d5317" BanyanArrays = "369465de-032e-4609-9dcf-82b89c370a7b" +Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" diff --git a/BanyanDataFrames/src/BanyanDataFrames.jl b/BanyanDataFrames/src/BanyanDataFrames.jl index fae29bce..2c6b1a85 100644 --- a/BanyanDataFrames/src/BanyanDataFrames.jl +++ b/BanyanDataFrames/src/BanyanDataFrames.jl @@ -3,6 +3,7 @@ module BanyanDataFrames using Arrow, Banyan, BanyanArrays, + Base64, DataFrames, Dates, Downloads, diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index c16c9e33..398d4dc9 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -8,6 +8,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: shuffled, max_num_bytes_exact = sampling_config.assume_shuffled, sampling_config.max_num_bytes_exact # TODO: Replace `max_exact_sample_length` with `max_num_bytes_exact` is_main = is_main_worker() + sample_rate = sampling_config.rate # Get cached Location and if it has valid parameters and sample, return curr_metadata_invalid, curr_sample_invalid = loc.metadata_invalid, loc.sample_invalid @@ -21,7 +22,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # Get paths for writing sample and metadata metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" + sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))$sample_rate" # Get metadata if it is still valid curr_meta::Arrow.Table = if !curr_metadata_invalid @@ -367,7 +368,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # Write the sample to S3 cache if previously invalid if curr_sample_invalid - write(sample_path, remote_sample.value.data) + write(sample_path, remote_sample.value) end if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 3b772ff8..4a18f3b1 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -96,8 +96,8 @@ end invalidate_all_locations() - p1 = "s3://$(bucket)/iris_large_$format.$format" - p2 = "s3://$(bucket)/iris_large_tmp_$format.$format" + p1 = "s3://$(bucket)/iris_large.$format" + p2 = "s3://$(bucket)/iris_large_tmp.$format" df = read_table(p1; metadata_invalid=true, 
invalidate_samples=true) sample(df) From cbcdd7c8b1c84427c4b6335a5de9d1734c8c9486 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Wed, 10 Aug 2022 22:53:51 -0700 Subject: [PATCH 15/25] Fix bugs --- Banyan/src/Banyan.jl | 2 +- Banyan/src/location.jl | 28 +++++++++++++++++++--- Banyan/src/locations.jl | 6 ++--- Banyan/src/queues.jl | 2 -- Banyan/src/requests.jl | 16 +++++-------- BanyanDataFrames/src/locations.jl | 2 +- BanyanDataFrames/src/pfs.jl | 12 ++++++---- BanyanDataFrames/test/sample_collection.jl | 19 +++++++++++---- BanyanHDF5/src/locations.jl | 2 +- BanyanHDF5/test/hdf5.jl | 2 +- BanyanImages/test/jpg.jl | 2 +- 11 files changed, 62 insertions(+), 31 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 388c0083..1cdba8ed 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -98,7 +98,7 @@ export invalidate_all_locations, invalidate_location, invalidate_metadata, inval export NOTHING_LOCATION, INVALID_LOCATION, NO_LOCATION_PATH export has_separate_metadata, get_sample, get_metadata, get_sample_and_metadata export LocationPath, SamplingConfig -export has_metadata, has_sample, get_sample_rate, configure_sampling +export has_metadata, has_sample, get_sample_rate, configure_sampling, get_sampling_config, get_sampling_configs, set_sampling_configs export type_to_str, str_to_type export banyan_metadata_bucket_name, banyan_samples_bucket_name, get_metadata_path, get_sample_path_prefix, get_sample_path diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 2b2f7f61..1af470c9 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -128,7 +128,8 @@ function get_sample_rate(l_path::LocationPath) banyan_samples_objects = try res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"] res isa Base.Vector ? res : [res] - catch + catch e + @show e return desired_sample_rate end sample_rate = -1 @@ -148,7 +149,10 @@ end # Checking for having metadata, samples +has_metadata(p::String=""; kwargs...) = + has_metadata(get_location_path_with_format(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool + println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path))") try !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) catch @@ -156,6 +160,8 @@ function has_metadata(l_path:: LocationPath)::Bool end end +has_sample(p::String=""; kwargs...) = + has_sample(get_location_path_with_format(p; kwargs...)) function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) pre = sc.force_new_sample_rate ? 
get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) @@ -186,6 +192,22 @@ struct AWSExceptionInfo end end +function get_metadata_local_path() + p = joinpath(homedir(), ".banyan", "metadata") + if !isdir(p) + mkpath(p) + end + p +end + +function get_samples_local_path() + p = joinpath(homedir(), ".banyan", "metadata") + if !isdir(p) + mkpath(p) + end + p +end + function get_location_source(lp::LocationPath)::Tuple{Location,String,String} global s3 @@ -195,7 +217,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # Load in metadata metadata_path = get_metadata_path(lp) - metadata_local_path = joinpath(homedir(), ".banyan", "metadata", metadata_path) + metadata_local_path = joinpath(get_metadata_local_path(), metadata_path) metadata_s3_path = "/$(banyan_metadata_bucket_name())/$metadata_path" src_params_not_stored_locally = false src_params::Dict{String, String} = if isfile(metadata_local_path) @@ -252,7 +274,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # Find local samples found_local_samples = Tuple{String,Int64}[] found_local_sample_rate_diffs = Int64[] - samples_local_dir = joinpath(homedir(), ".banyan", "samples") + samples_local_dir = get_samples_local_path() local_sample_paths = isdir(samples_local_dir) ? readdir(samples_local_dir, join=true) : String[] for local_sample_path in local_sample_paths if startswith(local_sample_path, sample_path_prefix) diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 565e45ac..7e4b8e6e 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -16,7 +16,7 @@ LocationSource(name::String, parameters::Union{Dict{String,Any},Dict{String,Stri LocationDestination( name::String, - parameters::LocationParameters + parameters::Union{Dict{String,Any},Dict{String,String}} )::Location = Location("None", name, LocationParameters(), parameters, -1, Sample(), false, false) function to_jl(lt::Location) @@ -316,8 +316,8 @@ function invalidate_samples(p; kwargs...) # Delete locally samples_local_dir = joinpath(homedir(), ".banyan", "samples") + sample_path_prefix = get_sample_path_prefix(lp) if isdir(samples_local_dir) - sample_path_prefix = get_sample_path_prefix(lp) for local_sample_path in readdir(samples_local_dir, join=true) if startswith(local_sample_path, sample_path_prefix) rm(local_sample_path) @@ -463,7 +463,7 @@ function RemoteSource( # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. 
loc, local_metadata_path, local_sample_path = get_location_source(lp) - sc = get_sampling_config(lp) + sc = deepcopy(get_sampling_config(lp)) sc.rate = parse_sample_rate(local_sample_path) if !loc.metadata_invalid && !loc.sample_invalid diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index f565277c..466ff88c 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -172,7 +172,6 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) "MessageDeduplicationId" => generated_message_id * string(i) ) ) - @show msg @show i end end @@ -186,7 +185,6 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) "chunk_idx" => i, "num_chunks" => num_chunks ) - @show msg msg_json = JSON.json(msg) SQS.send_message( msg_json, diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 6da8d9c4..32d67630 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -289,7 +289,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n num_remaining_chunks = num_chunks - 1 if is_debug_on() - printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") + println("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end @show num_chunks @@ -318,7 +318,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n # call to `send_evaluation` end - error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, contents, error_for_main_stuck, error_for_main_stuck_time) + error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, whole_message_contents, error_for_main_stuck, error_for_main_stuck_time) elseif message_type == "EVALUATION_END" if message["end"]::Bool == true break @@ -700,7 +700,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) session = get_session() gather_queue = gather_queue_url() - stored_message = nothing + stored_res = nothing error_for_main_stuck, error_for_main_stuck_time = nothing, nothing partial_gathers = Dict{ValueId,String}() while true @@ -734,18 +734,14 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) message["contents"] end - if haskey(session.futures_on_client, value_id) - value = from_jl_string(whole_message_contents) - f = session.futures_on_client[value_id]::Future - f.value = value - # TODO: Update stale/mutated here to avoid costly - # call to `send_evaluation` + if value_id == "-1" + stored_res = from_jl_string(whole_message_contents) end error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, whole_message_contents, error_for_main_stuck, error_for_main_stuck_time) elseif (message_type == "EVALUATION_END") if message["end"]::Bool == true - return stored_message + return stored_res end end end diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 398d4dc9..e1a71efb 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -424,6 +424,6 @@ RemoteTableDestination(remotepath)::Location = Dict( "format" => get_file_ending(remotepath), "nrows" => "0", - "path" => remotepath, + "path" => remotepath ), ) \ No newline at end of file diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index f9fe0981..b324c4ba 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -223,7 +223,7 @@ function ReadBlockHelper(@nospecialize(format_value)) nworkers = get_nworkers(comm) npartitions = nbatches * 
nworkers partition_idx = get_partition_idx(batch_idx, nbatches, comm) - nrows::Int64 = meta_nrows + nrows::Int64 = length(meta_nrows) rows_per_partition = cld(nrows, npartitions) sorting_perm = sortperm(meta_nrows, rev=true) files_by_partition = Base.Vector{Int64}[] @@ -335,7 +335,7 @@ function ReadBlockHelper(@nospecialize(format_value)) dfs = Base.Vector{Any}(undef, ndfs) if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND - @show (files_to_read, get_worker_idx()) + @show (filezs_to_read, get_worker_idx()) end # Iterate through files and identify which ones correspond to the range of @@ -484,6 +484,10 @@ function WriteHelper(@nospecialize(format_value)) # SAMPLE/METADATA COLLECTIOM AND STORAGE # ########################################## + # Get sampling configuration + sampling_config = get_sampling_config(lp) + sample_rate = sampling_config.rate + # Get paths for reading in metadata and Location tmp_suffix = nbatches > 1 ? ".tmp" : "" lp_tmp = LocationPath(loc_params_path * tmp_suffix, "arrow", "2") @@ -525,8 +529,6 @@ function WriteHelper(@nospecialize(format_value)) # Gather # of rows, # of bytes, empty sample, and actual sample nbytes = part_res isa Empty ? 0 : Banyan.sample_memory_usage(part_res) - sampling_config = get_sampling_config(lp) - sample_rate = sampling_config.rate sampled_part = (part_res isa Empty || is_disk) ? empty_df : Banyan.get_sample_from_data(part_res, sample_rate, nrows) gathered_data = gather_across((nrows, nbytes, part_res isa Empty ? part_res : empty(part_res), sampled_part), comm) @@ -574,6 +576,8 @@ function WriteHelper(@nospecialize(format_value)) sample_invalid = true end + println("In Write with sample_invalid=$sample_invalid and sample_memory_usage=$sample_memory_usage while sampling_config=$sampling_config, writing to $m_path") + # Get the actual sample by concatenating if !is_disk && !sample_invalid sampled_parts = [gathered[4] for gathered in gathered_data] diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 4a18f3b1..72c0ba7f 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -94,6 +94,9 @@ end bucket = get_cluster_s3_bucket_name() + configure_sampling(max_num_bytes=max_num_bytes, always_shuffled=shuffled) + exact_sample = max_num_bytes > 0 + invalidate_all_locations() p1 = "s3://$(bucket)/iris_large.$format" @@ -104,24 +107,32 @@ end @show get_sample_rate(p1) configure_sampling(p2; sample_rate=5) - write_table(p2, df) + @show get_sampling_configs() + write_table(df, p2) + @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) - @test has_sample(p2) + @test has_sample(p2) == !exact_sample invalidate_metadata(p2) @test !has_metadata(p2) - @test has_sample(p2) - innvalidate_location(p2) + @test has_sample(p2) == !exact_sample + invalidate_location(p2) @test !has_metadata(p2) @test !has_sample(p2) + @show get_sample_rate(p2) df2 = read_table(p2) + @show Banyan.get_location_path_with_format(p2) + @show get_sampling_configs() + @show get_sampling_config(p2) @show get_sample_rate(p2) sample(df2) @show get_sample_rate(p2) df2 = read_table(p2; samples_invalid=true) sample(df2) + @test get_sample_rate(p2) == 5 configure_sampling(sample_rate=7, for_all_locations=true) + @test get_sample_rate(p2) == 5 df2 = read_table(p2; metadata_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 82142481..ee214963 100644 --- a/BanyanHDF5/src/locations.jl 
+++ b/BanyanHDF5/src/locations.jl @@ -186,7 +186,7 @@ function RemoteHDF5Destination(remotepath)::Location "path" => remotepath, "subpath" => datasetpath, "path_and_subpath" => path_and_subpath, - "format" => "hdf5" + "format" => "hdf5", ) ) end diff --git a/BanyanHDF5/test/hdf5.jl b/BanyanHDF5/test/hdf5.jl index 8080db4a..23ac8254 100644 --- a/BanyanHDF5/test/hdf5.jl +++ b/BanyanHDF5/test/hdf5.jl @@ -65,7 +65,7 @@ shuffled in [true, false] invalidate_metadata(p) @test !has_metadata(p) @test has_sample(p) - innvalidate_location(p) + invalidate_location(p) @test !has_metadata(p) @test !has_sample(p) diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl index bdf0ae26..ab982b79 100644 --- a/BanyanImages/test/jpg.jl +++ b/BanyanImages/test/jpg.jl @@ -110,7 +110,7 @@ end invalidate_metadata(p) @test !has_metadata(p) @test has_sample(p) - innvalidate_location(p) + invalidate_location(p) @test !has_metadata(p) @test !has_sample(p) From dfe98d10c2e3f66e59049c09fec3ad73070c9ac1 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Mon, 15 Aug 2022 09:57:19 -0400 Subject: [PATCH 16/25] Add more options for updating cluster --- Banyan/some_file | 1 + Banyan/src/clusters.jl | 17 +++++++++---- Banyan/src/location.jl | 18 +++++++++----- Banyan/src/locations.jl | 29 ++++++++++++++++------ BanyanDataFrames/src/locations.jl | 12 +++++++++ BanyanDataFrames/src/pfs.jl | 26 +++++++++++++++++-- BanyanDataFrames/test/sample_collection.jl | 8 ++++-- 7 files changed, 88 insertions(+), 23 deletions(-) create mode 100644 Banyan/some_file diff --git a/Banyan/some_file b/Banyan/some_file new file mode 100644 index 00000000..3b18e512 --- /dev/null +++ b/Banyan/some_file @@ -0,0 +1 @@ +hello world diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index c96efa47..2e7572c3 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -35,13 +35,14 @@ function create_cluster(; vpc_id = nothing, subnet_id = nothing, nowait=false, + force_create=false, kwargs..., ) # Configure using parameters c = configure(; kwargs...) - clusters = get_clusters(; kwargs...) + clusters = get_clusters(name; kwargs...) if isnothing(name) name = "Cluster " * string(length(clusters) + 1) end @@ -52,11 +53,11 @@ function create_cluster(; # Check if the configuration for this cluster name already exists # If it does, then recreate cluster if haskey(clusters, name) - if clusters[name].status == :terminated + if force_create || clusters[name].status == :terminated @info "Started re-creating cluster named $name" send_request_get_response( :create_cluster, - Dict("cluster_name" => name, "recreate" => true), + Dict("cluster_name" => name, "recreate" => true, "force_create" => true), ) if !nowait wait_for_cluster(name; kwargs...) @@ -139,12 +140,17 @@ function delete_cluster(name::String; kwargs...) ) end -function update_cluster(name::String; nowait=false, kwargs...) +function update_cluster(name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, nowait=false, kwargs...) configure(; kwargs...) 
@info "Updating cluster named $name" send_request_get_response( :update_cluster, - Dict{String, Any}("cluster_name" => name) + Dict{String, Any}( + "cluster_name" => name, + "force_update" => force_update, + "update_linux_packages" => update_linux_packages, + "reinstall_julia" => reinstall_julia + ) ) if !nowait wait_for_cluster(name) @@ -189,6 +195,7 @@ function _get_clusters(cluster_name::String)::Dict{String,Cluster} if !isempty(cluster_name) filters["cluster_name"] = cluster_name end + @show filters response = send_request_get_response(:describe_clusters, Dict{String,Any}("filters"=>filters)) clusters_dict::Dict{String,Cluster} = Dict{String,Cluster}() for (name::String, c::Dict{String,Any}) in response["clusters"]::Dict{String,Any} diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 1af470c9..a5bdf0b2 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -152,8 +152,12 @@ end has_metadata(p::String=""; kwargs...) = has_metadata(get_location_path_with_format(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool - println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path))") + println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path)) and banyan_metadata_bucket_name()=$(banyan_metadata_bucket_name())") try + @show propertynames(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))) + @show keys(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))) + @show S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["KeyCount"] + @show S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) catch false @@ -166,8 +170,10 @@ function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) pre = sc.force_new_sample_rate ? 
get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) try + @show S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre)) !isempty(S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"]) - catch + catch e + @show e false end end @@ -333,11 +339,11 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} banyan_samples_object_sample_rate = -1 for banyan_samples_object in banyan_samples_objects object_key = banyan_samples_object["Key"] - if startswith(object_key, banyan_samples_object_prefix) + if startswith(object_key, sample_path_prefix) object_sample_rate = parse_sample_rate(object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) - curr_sample_rate_diff = abs(object_sample_rate - sample_rate) - if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff + curr_sample_rate_diff = abs(object_sample_rate - banyan_samples_object_sample_rate) + if banyan_samples_object_sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff banyan_samples_object_sample_rate = object_sample_rate end end @@ -346,7 +352,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} sample_path_suffix = "$sample_path_prefix$banyan_samples_object_sample_rate" blob = s3("GET", "/$(banyan_samples_bucket_name())/$sample_path_suffix") final_local_sample_path = joinpath(samples_local_dir, sample_path_suffix) - write(final_local_sample_path, blob) + write(final_local_sample_path, seekstart(blob.io)) end # Construct and return LocationSource diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 7e4b8e6e..628c77e5 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -303,6 +303,7 @@ function invalidate_metadata(p; kwargs...) end # Delete from S3 + println("Deleting get_metadata_path(lp)=$(get_metadata_path(lp))") try S3.delete_object(banyan_samples_bucket_name(), get_metadata_path(lp)) catch e @@ -335,11 +336,17 @@ function invalidate_samples(p; kwargs...) end [] end + @show banyan_samples_objects if !isempty(banyan_samples_objects) objects_to_delete = [] for d in banyan_samples_objects push!(objects_to_delete, Dict("Key" => d["Key"])) end + S3.delete_objects( + banyan_samples_bucket_name(), + Dict("objects" => objects_to_delete) + ) + @show objects_to_delete S3.delete_objects( banyan_samples_bucket_name(), Dict("Objects" => objects_to_delete) @@ -350,6 +357,9 @@ function invalidate_location(p; kwargs...) invalidate_metadata(p; kwargs...) invalidate_samples(p; kwargs...) 
end +function partition(series, partition_size) + (series[i:min(i+(partition_size-1),end)] for i in 1:partition_size:length(series)) +end function invalidate_all_locations() for subdir in ["samples", "metadata"] local_dir = joinpath(homedir(), ".banyan", subdir) @@ -369,20 +379,23 @@ function invalidate_all_locations() end [] end + println("Deleting banyan_samples_objects=$banyan_samples_objects from bucket_name=$bucket_name") if !isempty(banyan_samples_objects) objects_to_delete = [] for d in banyan_samples_objects push!(objects_to_delete, Dict("Key" => d["Key"])) end if !isempty(objects_to_delete) - try - S3.delete_objects( - banyan_samples_bucket_name(), - Dict("Objects" => objects_to_delete) - ) - catch e - if is_debug_on() - show(e) + for objects_to_delete_partition in partition(objects_to_delete, 1000) + try + S3.delete_objects( + bucket_name, + Dict("Objects" => objects_to_delete_partition) + ) + catch e + if is_debug_on() + show(e) + end end end end diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index e1a71efb..a53abb79 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -3,6 +3,12 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) function _remote_table_source(lp::LocationPath, loc::Location, sampling_config::SamplingConfig)::Location + metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") + haskey(s3_res, "Contents") ? s3_res["Contents"] : [] + end + println("In _remote_table_source at start with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") + # Setup for sampling remotepath = lp.path shuffled, max_num_bytes_exact = sampling_config.assume_shuffled, sampling_config.max_num_bytes_exact @@ -338,6 +344,12 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # If a file does not exist, one of the get_metadata/get_sample functions # will error. + metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") + haskey(s3_res, "Contents") ? s3_res["Contents"] : [] + end + println("In _remote_table_source at end with metadata_dir=$metadata_dir and metadata_bucket_dir=$metadata_bucket_dir and metadata_path=$metadata_path and curr_metadata_invalid=$curr_metadata_invalid") + # Get source parameters src_params = Dict( diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index b324c4ba..9ee72d52 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -388,6 +388,12 @@ function WriteHelper(@nospecialize(format_value)) loc_name::String, loc_params::Dict{String,Any}, ) + metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") + haskey(s3_res, "Contents") ? s3_res["Contents"] : [] + end + println("In Write at start with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") + # Get rid of splitting divisions if they were used to split this data into # groups splitting_divisions = Banyan.get_splitting_divisions() @@ -494,8 +500,10 @@ function WriteHelper(@nospecialize(format_value)) # m_path = is_main ? 
get_meta_path() : "" # location_path = is_main ? get_location_path(loc_params_path * tmp_suffix) : "" # m_path, location_path = sync_across((m_path, location_path), comm=comm) - m_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp_tmp))" - s_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp_tmp))$sample_rate" + m_dir = "s3/$(banyan_metadata_bucket_name())" + s_dir = "s3/$(banyan_samples_bucket_name())" + m_path = "$m_dir/$(get_metadata_path(lp_tmp))" + s_path = "$s_dir/$(get_sample_path_prefix(lp_tmp))$sample_rate" # loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params # Read in meta path if it's there @@ -536,6 +544,12 @@ function WriteHelper(@nospecialize(format_value)) # On the main worker, finalize metadata and location info. sample_invalid = false if is_main + metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") + haskey(s3_res, "Contents") ? s3_res["Contents"] : [] + end + println("In Write with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") + # Determine paths and #s of rows for metadata file for worker_i in 1:nworkers push!( @@ -588,9 +602,17 @@ function WriteHelper(@nospecialize(format_value)) end # Determine paths for this batch and gather # of rows + @show m_path + @show readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b/") + @show readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + bucket_dir = readdir("s3/$(banyan_metadata_bucket_name())") + println("On main in $(banyan_metadata_bucket_name()): $bucket_dir") Arrow.write(m_path, (path=curr_remotepaths, nrows=curr_nrows); compress=:zstd, metadata=curr_src_parameters) end + @show readdir("s3/$(banyan_metadata_bucket_name())") + @show Banyan.S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] + ################################### # Handling Final Batch by Copying # ################################### diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 72c0ba7f..b4688eb0 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -79,7 +79,7 @@ end end -@testset "Reading/writing $(shuffled ? "shuffle " : " ")$format data and sampling it with $scheduling_config and maximum # of bytes for exact sample" for scheduling_config in +@testset "Reading/writing $(shuffled ? 
"shuffle " : " ")$format data and sampling it with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for scheduling_config in [ "default scheduling", "parallelism encouraged", @@ -94,7 +94,7 @@ end bucket = get_cluster_s3_bucket_name() - configure_sampling(max_num_bytes=max_num_bytes, always_shuffled=shuffled) + configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled) exact_sample = max_num_bytes > 0 invalidate_all_locations() @@ -104,6 +104,8 @@ end df = read_table(p1; metadata_invalid=true, invalidate_samples=true) sample(df) + @show max_num_bytes + @show exact_sample @show get_sample_rate(p1) configure_sampling(p2; sample_rate=5) @@ -112,6 +114,8 @@ end @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) + sleep(5) + @test has_metadata(p2) @test has_sample(p2) == !exact_sample invalidate_metadata(p2) @test !has_metadata(p2) From 52be36886b8889be6163be56160aac509390fec3 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 16 Aug 2022 06:53:05 -0400 Subject: [PATCH 17/25] Add Arrow import --- Banyan/src/Banyan.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 1cdba8ed..ea4107ed 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -21,7 +21,8 @@ global NOT_USING_MODULES = String["ProfileView", "SnoopCompileCore"] using FilePathsBase: joinpath, isempty using Base: notnothing, env_project_file -using Base64, +using Arrow, + Base64, DataStructures, Dates, Downloads, From 80e23298d888a522d0310b64c8eca84c218b8789 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 16 Aug 2022 07:03:33 -0400 Subject: [PATCH 18/25] Fix bugs --- Banyan/src/location.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index a5bdf0b2..06d3c016 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -231,7 +231,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} if_modified_since_string = "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" try - d = get_src_params_dict_from_arrow(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string)))) + d = get_src_params_dict_from_arrow(seekstart(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))).io)) src_params_not_stored_locally = true d catch e @@ -251,7 +251,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} end else try - d = get_src_params_dict_from_arrow(s3("GET", metadata_s3_path)) + d = get_src_params_dict_from_arrow(seekstart(s3("GET", metadata_s3_path).io)) src_params_not_stored_locally = true d catch e @@ -259,7 +259,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} show(e) end if !AWSExceptionInfo(e).not_found - @warn "Assumming metadata isn't copied in the cloud because of following error in attempted access" + @warn "Assuming metadata isn't copied in the cloud because of following error in attempted access" show(e) end Dict{String, String}() @@ -306,7 +306,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix$sample_rate" try blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) - 
write(sample_local_path, blob) # This overwrites the existing file + write(sample_local_path, seekstart(blob.io)) # This overwrites the existing file final_local_sample_path = sample_local_path break catch e From 734f8b7f68412eb3bc32c1288dc5e7ec45633c78 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 16 Aug 2022 08:44:19 -0400 Subject: [PATCH 19/25] Switch to using S3Path where possible --- Banyan/src/Banyan.jl | 1 + Banyan/src/futures.jl | 1 + Banyan/src/location.jl | 173 +++++++++++---------- Banyan/src/locations.jl | 90 ++++------- Banyan/src/samples.jl | 12 +- Banyan/src/utils.jl | 27 +++- BanyanDataFrames/src/df.jl | 3 +- BanyanDataFrames/src/locations.jl | 9 +- BanyanDataFrames/src/pfs.jl | 39 +++-- BanyanDataFrames/test/sample_collection.jl | 16 +- BanyanHDF5/src/locations.jl | 7 +- BanyanImages/src/locations.jl | 7 +- 12 files changed, 206 insertions(+), 179 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index ea4107ed..8bde1ae3 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -42,6 +42,7 @@ AWS.DEFAULT_BACKEND[] = AWS.DownloadsBackend() s3 = set_features(AWS.AWSServices.s3; use_response_type=true) using AWS.AWSExceptions using AWS: @service +# TODO: Remove @service S3 since we just use AWSS3 and s3 @service S3 use_response_type = true @service SQS use_response_type = true using AWSS3 diff --git a/Banyan/src/futures.jl b/Banyan/src/futures.jl index c1e769a1..0274f081 100644 --- a/Banyan/src/futures.jl +++ b/Banyan/src/futures.jl @@ -19,6 +19,7 @@ function create_new_future(source::Location, mutate_from::Future, datatype::Stri sourced(new_future, source) destined(new_future, None()) + # TODO: Add Size location here if needed # Handle locations that have an associated value source_src_name = source.src_name diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 06d3c016..6ec0f39a 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -65,11 +65,8 @@ end function get_sample_path_prefix(lp::LocationPath) format_name_sep = !isempty(lp.format_name) ? "_" : "" - format_version_sep = !isempty(lp.format_version) ? "_" : "" - lp.path_hash * "_" * lp.format_name * format_name_sep * lp.format_version * format_version_sep + lp.path_hash * "_" * lp.format_name * format_name_sep * lp.format_version end -get_sample_path(lp::LocationPath, sample_rate::Int64) = - get_sample_path_prefix(lp) * string(sample_rate) get_metadata_path(lp::LocationPath) = lp.path_hash banyan_samples_bucket_name() = "banyan-samples-$(get_organization_id())" banyan_metadata_bucket_name() = "banyan-metadata-$(get_organization_id())" @@ -103,45 +100,37 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = get_sample_rate(p::String=""; kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) function parse_sample_rate(object_key) - lastpos = findlast("_", object_key) - if isnothing(lastpos) - error("Object name \"$object_key\" doesn't contain a sample rate") - end - parse(Int64, object_key[(lastpos.start+1):end]) + parse(Int64, last(splitpath(object_key))) end function get_sample_rate(l_path::LocationPath) + sc = get_sampling_config(l_path) + @show sc + # Get the desired sample rate - desired_sample_rate = get_sampling_config(l_path).rate + desired_sample_rate = sc.rate # If we just want the default sample rate or if a new sample rate is being # forced, then just return that. 
if isempty(l_path.path) return desired_sample_rate end - sc = get_sampling_config(l_path) if sc.force_new_sample_rate return desired_sample_rate end # Find a cached sample with a similar sample rate - pre = get_sample_path_prefix(l_path) - banyan_samples_objects = try - res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"] - res isa Base.Vector ? res : [res] - catch e - @show e - return desired_sample_rate - end + banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") + banyan_samples_object_dir = joinpath(banyan_samples_bucket, get_sample_path_prefix(l_path)) sample_rate = -1 - for banyan_samples_object in banyan_samples_objects - object_key = banyan_samples_object["Key"] - if startswith(object_key, pre) - object_sample_rate = parse_sample_rate(object_key) - object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) - curr_sample_rate_diff = abs(object_sample_rate - sample_rate) - if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff - sample_rate = object_sample_rate - end + @show banyan_samples_object_dir + @show readdir(banyan_samples_bucket) + @show readdir_no_error(banyan_samples_object_dir) + for object_key in readdir_no_error(banyan_samples_object_dir) + object_sample_rate = parse(Int64, object_key) + object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) + curr_sample_rate_diff = abs(sample_rate - desired_sample_rate) + if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff + sample_rate = object_sample_rate end end sample_rate != -1 ? sample_rate : desired_sample_rate @@ -153,28 +142,32 @@ has_metadata(p::String=""; kwargs...) = has_metadata(get_location_path_with_format(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path)) and banyan_metadata_bucket_name()=$(banyan_metadata_bucket_name())") - try - @show propertynames(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))) - @show keys(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))) - @show S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["KeyCount"] - @show S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] - !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) - catch - false - end + isfile(S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(l_path))")) end has_sample(p::String=""; kwargs...) = has_sample(get_location_path_with_format(p; kwargs...)) function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) - pre = sc.force_new_sample_rate ? 
get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) - try - @show S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre)) - !isempty(S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"]) - catch e - @show e - false + banyan_sample_dir = S3Path("s3://$(banyan_samples_bucket_name())/$(get_sample_path_prefix(l_path))") + println("In has_sample") + @show sc + @show sc.force_new_sample_rate + @show joinpath(banyan_sample_dir, string(sc.rate)) + @show isdir_no_error(banyan_sample_dir) + @show isdir_no_error(banyan_sample_dir) && !isempty(readdir(banyan_sample_dir)) + @show readdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/")) + @show isdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2")) + @show isdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2/")) + @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2")) + @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arr/")) + @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_200/")) + @show banyan_sample_dir + @show readdir_no_error(banyan_sample_dir) + if sc.force_new_sample_rate + isfile(joinpath(banyan_sample_dir, string(sc.rate))) + else + !isempty(readdir_no_error(banyan_sample_dir)) end end @@ -266,13 +259,13 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} end end # Store metadata locally - if src_params_not_stored_locally && !isempty(d) + if src_params_not_stored_locally && !isempty(src_params) Arrow.write(metadata_local_path, Arrow.Table(); metadata=src_params) end # Load in sample - sc = get_sampling_config() + sc = get_sampling_config(lp) force_new_sample_rate = sc.force_new_sample_rate desired_sample_rate = sc.rate sample_path_prefix = get_sample_path_prefix(lp) @@ -280,16 +273,16 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # Find local samples found_local_samples = Tuple{String,Int64}[] found_local_sample_rate_diffs = Int64[] - samples_local_dir = get_samples_local_path() - local_sample_paths = isdir(samples_local_dir) ? readdir(samples_local_dir, join=true) : String[] - for local_sample_path in local_sample_paths - if startswith(local_sample_path, sample_path_prefix) - local_sample_rate = parse_sample_rate(object_key) - diff_sample_rate = abs(local_sample_rate - desired_sample_rate) - if !force_new_sample_rate || sample_rate_diff == 0 - push!(found_local_samples, (local_sample_path, local_sample_rate)) - push!(found_local_sample_rate_diffs, diff_sample_rate) - end + sample_local_dir = joinpath(get_samples_local_path(), sample_path_prefix) + mkpath(sample_local_dir) + local_sample_paths = isdir(sample_local_dir) ? 
readdir(sample_local_dir) : String[] + for local_sample_path_suffix in local_sample_paths + local_sample_path = joinpath(sample_local_dir, local_sample_path_suffix) + local_sample_rate = parse(Int64, local_sample_path_suffix) + diff_sample_rate = abs(local_sample_rate - desired_sample_rate) + if !force_new_sample_rate || diff_sample_rate == 0 + push!(found_local_samples, (local_sample_path, local_sample_rate)) + push!(found_local_sample_rate_diffs, diff_sample_rate) end end @@ -297,17 +290,26 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # rate closest to the desired sample rate) found_local_samples = found_local_samples[sortperm(found_local_sample_rate_diffs)] - # Find a local sample that is up-to-date + # Find a local sample that is up-to-date. NOTE: The data itself might have + # changed in which case the cached samples are out-of-date and we don't + # currently capture that. This doesn't even check if there is a more recent + # sample of a different sample rate (although that is kind of a bug/limitation + # that could be resolved though the best way to resolve it would be by + # comparing to the last modified date for the data itself). It just checks that the remote sample + # hasn't been manually invalidated by the user or a Banyan writing function + # and that there isn't a newer sample for this specific sample rate. final_local_sample_path = "" + final_sample_rate = -1 for (sample_local_path, sample_rate) in found_local_samples lm = Dates.unix2datetime(mtime(sample_local_path)) if_modified_since_string = "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" - sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix$sample_rate" + sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix/$sample_rate" try blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) write(sample_local_path, seekstart(blob.io)) # This overwrites the existing file final_local_sample_path = sample_local_path + final_sample_rate = sample_rate break catch e if is_debug_on() @@ -318,6 +320,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} @warn "Assumming locally stored metadata is invalid because it is not backed up to the cloud" elseif ei.unmodified_since final_local_sample_path = sample_local_path + final_sample_rate = sample_rate break else @warn "Assumming locally stored metadata is invalid because of following error in accessing the metadata copy in the cloud" @@ -327,32 +330,29 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} end # If no such sample is found, search the S3 bucket - banyan_samples_objects = try - res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => sample_path_prefix))["Contents"] - res isa Base.Vector ? 
res : [res] - catch e - if is_debug_on() - show(e) - end - [] - end - banyan_samples_object_sample_rate = -1 - for banyan_samples_object in banyan_samples_objects - object_key = banyan_samples_object["Key"] - if startswith(object_key, sample_path_prefix) - object_sample_rate = parse_sample_rate(object_key) + if isempty(final_local_sample_path) + banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") + final_sample_rate = -1 + banyan_samples_object_dir = joinpath(banyan_samples_bucket, sample_path_prefix) + for object_key in readdir_no_error(banyan_samples_object_dir) + object_sample_rate = parse(Int64, object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) - curr_sample_rate_diff = abs(object_sample_rate - banyan_samples_object_sample_rate) - if banyan_samples_object_sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff - banyan_samples_object_sample_rate = object_sample_rate + curr_sample_rate_diff = abs(final_sample_rate - desired_sample_rate) + if force_new_sample_rate ? (object_sample_rate_diff == 0) : (final_sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff) + final_sample_rate = object_sample_rate + final_local_sample_path = joinpath(sample_local_dir, object_key) end end - end - if banyan_samples_object_sample_rate != -1 - sample_path_suffix = "$sample_path_prefix$banyan_samples_object_sample_rate" - blob = s3("GET", "/$(banyan_samples_bucket_name())/$sample_path_suffix") - final_local_sample_path = joinpath(samples_local_dir, sample_path_suffix) - write(final_local_sample_path, seekstart(blob.io)) + if final_sample_rate != -1 + cp( + joinpath( + banyan_samples_bucket, + sample_path_prefix, + string(final_sample_rate) + ), + Path(final_local_sample_path) + ) + end end # Construct and return LocationSource @@ -364,13 +364,14 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} ) res_location.metadata_invalid = isempty(src_params) res_location.sample_invalid = isempty(final_local_sample_path) + @show final_sample_rate + @show final_local_sample_path + final_sample_rate = isempty(final_local_sample_path) ? desired_sample_rate : final_sample_rate + @show desired_sample_rate + @show sample_local_dir ( res_location, metadata_local_path, - if !isempty(final_local_sample_path) - final_local_sample_path - else - joinpath(samples_local_dir, "$sample_path_prefix$desired_sample_rate") - end + joinpath(sample_local_dir, string(final_sample_rate)) ) end \ No newline at end of file diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 628c77e5..d7decd31 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -304,12 +304,9 @@ function invalidate_metadata(p; kwargs...) # Delete from S3 println("Deleting get_metadata_path(lp)=$(get_metadata_path(lp))") - try - S3.delete_object(banyan_samples_bucket_name(), get_metadata_path(lp)) - catch e - if is_debug_on() - show(e) - end + s3p = S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))") + if isfile(s3p) + rm(s3p) end end function invalidate_samples(p; kwargs...) @@ -327,31 +324,17 @@ function invalidate_samples(p; kwargs...) end # Delete from S3 - banyan_samples_objects = try - res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => sample_path_prefix))["Contents"] - res isa Base.Vector ? 
res : [res] - catch e - if is_debug_on() - show(e) - end - [] - end - @show banyan_samples_objects - if !isempty(banyan_samples_objects) - objects_to_delete = [] - for d in banyan_samples_objects - push!(objects_to_delete, Dict("Key" => d["Key"])) - end - S3.delete_objects( - banyan_samples_bucket_name(), - Dict("objects" => objects_to_delete) - ) - @show objects_to_delete - S3.delete_objects( - banyan_samples_bucket_name(), - Dict("Objects" => objects_to_delete) - ) + s3p = S3Path("s3://$(banyan_samples_bucket_name())/$sample_path_prefix") + @show readdir_no_error(s3p) + @show s3p + @show path_as_dir(s3p) + @show readdir(S3Path("s3://$(banyan_samples_bucket_name())")) + if !isempty(readdir_no_error(s3p)) + rm(path_as_dir(s3p), recursive=true) end + @show readdir_no_error(s3p) + @show s3p + @show readdir(S3Path("s3://$(banyan_samples_bucket_name())")) end function invalidate_location(p; kwargs...) invalidate_metadata(p; kwargs...) @@ -370,34 +353,10 @@ function invalidate_all_locations() # Delete from S3 for bucket_name in [banyan_samples_bucket_name(), banyan_metadata_bucket_name()] - banyan_samples_objects = try - res = S3.list_objects_v2(bucket_name)["Contents"] - res isa Base.Vector ? res : [res] - catch e - if is_debug_on() - show(e) - end - [] - end - println("Deleting banyan_samples_objects=$banyan_samples_objects from bucket_name=$bucket_name") - if !isempty(banyan_samples_objects) - objects_to_delete = [] - for d in banyan_samples_objects - push!(objects_to_delete, Dict("Key" => d["Key"])) - end - if !isempty(objects_to_delete) - for objects_to_delete_partition in partition(objects_to_delete, 1000) - try - S3.delete_objects( - bucket_name, - Dict("Objects" => objects_to_delete_partition) - ) - catch e - if is_debug_on() - show(e) - end - end - end + s3p = S3Path("s3://$bucket_name") + if isdir_no_error(s3p) + for p in readdir(s3p, join=true) + rm(p, force=true, recursive=true) end end end @@ -476,24 +435,28 @@ function RemoteSource( # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. loc, local_metadata_path, local_sample_path = get_location_source(lp) - sc = deepcopy(get_sampling_config(lp)) - sc.rate = parse_sample_rate(local_sample_path) + @show lp + @show get_sampling_configs() + @show local_sample_path - if !loc.metadata_invalid && !loc.sample_invalid + res = if !loc.metadata_invalid && !loc.sample_invalid # Case where both sample and parameters are valid loc.sample.value = load_sample(local_sample_path) + loc.sample.rate = parse_sample_rate(local_sample_path) loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid - new_loc = offloaded(_remote_source, lp, loc, sc, args...; distributed=true) + new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + @show new_loc new_loc.sample.value = load_sample(local_sample_path) new_loc else # Case where sample is invalid # Get the Location with up-to-date metadata (source parameters) and sample - new_loc = offloaded(_remote_source, lp, loc, sc, args...; distributed=true) + new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) + @show new_loc if !loc.metadata_invalid # Store the metadata locally. 
The local copy just has the source @@ -508,4 +471,5 @@ function RemoteSource( new_loc end + res end \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 47d3a8fd..66eb51af 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -11,13 +11,13 @@ function configure_sampling( ) global session_sampling_configs - sc = get_sampling_config(path; kwargs...) + sc = default ? DEFAULT_SAMPLING_CONFIG : get_sampling_config(path; kwargs...) nsc = SamplingConfig( - (!isnothing(sample_rate) && !default) ? sample_rate : sc.rate, - (!isnothing(always_exact) && !default) ? always_exact : sc.always_exact, - (!isnothing(max_num_bytes_exact) && !default) ? max_num_bytes_exact : sc.max_num_bytes_exact, - (!isnothing(force_new_sample_rate) && !default) ? force_new_sample_rate : sc.force_new_sample_rate, - (!isnothing(assume_shuffled) && !default) ? assume_shuffled : sc.assume_shuffled, + (!isnothing(sample_rate)) ? sample_rate : sc.rate, + (!isnothing(always_exact)) ? always_exact : sc.always_exact, + (!isnothing(max_num_bytes_exact)) ? max_num_bytes_exact : sc.max_num_bytes_exact, + (!isnothing(force_new_sample_rate)) ? force_new_sample_rate : sc.force_new_sample_rate, + (!isnothing(assume_shuffled)) ? assume_shuffled : sc.assume_shuffled, ) session_id = _get_session_id_no_error() diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index cec3e57c..bcc38d90 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -640,4 +640,29 @@ size_from_str(s) = res[i] = parse(Int64, sz_str) end Tuple(res) - end \ No newline at end of file + end + +function isdir_no_error(p) + try + isdir(p) + catch e + if is_debug_on() + print("Failed to check isdir because of e=$e") + end + false + end +end +function path_as_dir(p) + p_sep = p.separator + endswith(string(p), p_sep) ? p : (p * p_sep) +end +function readdir_no_error(p) + try + readdir(path_as_dir(p)) + catch e + if is_debug_on() + print("Failed to readdir of p=$p because of e=$e") + end + String[] + end +end \ No newline at end of file diff --git a/BanyanDataFrames/src/df.jl b/BanyanDataFrames/src/df.jl index 3c604963..64654773 100644 --- a/BanyanDataFrames/src/df.jl +++ b/BanyanDataFrames/src/df.jl @@ -55,7 +55,8 @@ function read_table(path::String; kwargs...) invalidate(path; after=true, kwargs...) df_loc_nrows::Int64 = parse(Int64, df_loc.src_parameters["nrows"]) df_nrows = Future(df_loc_nrows) - DataFrame(Future(datatype="DataFrame", source=df_loc), df_nrows) + res = DataFrame(Future(datatype="DataFrame", source=df_loc), df_nrows) + res end # TODO: For writing functions, if a file is specified, enforce Replicated diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index a53abb79..3581365e 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -2,7 +2,8 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) -function _remote_table_source(lp::LocationPath, loc::Location, sampling_config::SamplingConfig)::Location +function _remote_table_source(lp::LocationPath, loc::Location)::Location + sampling_config = get_sampling_config(lp) metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") haskey(s3_res, "Contents") ? 
s3_res["Contents"] : [] @@ -28,7 +29,11 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # Get paths for writing sample and metadata metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))$sample_rate" + sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" + mkpath(sample_dir) + sample_path = "$sample_dir/$sample_rate" + @show sample_path + @show sample_rate # Get metadata if it is still valid curr_meta::Arrow.Table = if !curr_metadata_invalid diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index 9ee72d52..c1ec11be 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -495,15 +495,16 @@ function WriteHelper(@nospecialize(format_value)) sample_rate = sampling_config.rate # Get paths for reading in metadata and Location - tmp_suffix = nbatches > 1 ? ".tmp" : "" - lp_tmp = LocationPath(loc_params_path * tmp_suffix, "arrow", "2") + lp_tmp = LocationPath(path, "arrow", "2") # m_path = is_main ? get_meta_path() : "" # location_path = is_main ? get_location_path(loc_params_path * tmp_suffix) : "" # m_path, location_path = sync_across((m_path, location_path), comm=comm) m_dir = "s3/$(banyan_metadata_bucket_name())" s_dir = "s3/$(banyan_samples_bucket_name())" m_path = "$m_dir/$(get_metadata_path(lp_tmp))" - s_path = "$s_dir/$(get_sample_path_prefix(lp_tmp))$sample_rate" + s_sample_dir = "$s_dir/$(get_sample_path_prefix(lp_tmp))" + mkpath(s_sample_dir) + s_path = "$s_sample_dir/$sample_rate" # loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params # Read in meta path if it's there @@ -582,7 +583,7 @@ function WriteHelper(@nospecialize(format_value)) curr_src_parameters["nrows"] = string(total_nrows) curr_src_parameters["sample_memory_usage"] = string(sample_memory_usage) - if !is_disk && batch_idx == nbatches && sample_memory_usage <= sampling_config.max_num_bytes_exact + if is_disk || sample_memory_usage <= sampling_config.max_num_bytes_exact # If the total # of rows turns out to be inexact then we can simply mark it as # stale so that it can be collected more efficiently later on # We should be able to quickly recompute a more useful sample later @@ -590,15 +591,25 @@ function WriteHelper(@nospecialize(format_value)) sample_invalid = true end - println("In Write with sample_invalid=$sample_invalid and sample_memory_usage=$sample_memory_usage while sampling_config=$sampling_config, writing to $m_path") + println("In Write with sample_invalid=$sample_invalid (because sample_memory_usage=$sample_memory_usage and sampling_config.max_num_bytes_exact=$(sampling_config.max_num_bytes_exact)) and while sampling_config=$sampling_config, writing to $m_path and $s_path, on batch_idx=$batch_idx with curr_src_parameters=$curr_src_parameters") + + @show get_sampling_configs() + @show lp + @show get_sampling_config(lp) + @show s_path + @show s_sample_dir # Get the actual sample by concatenating - if !is_disk && !sample_invalid + if !sample_invalid sampled_parts = [gathered[4] for gathered in gathered_data] if batch_idx > 1 - push!(sampled_parts, curr_location.sample.value |> seekstart |> Arrow.Table |> DataFrames.DataFrame) + push!(sampled_parts, Arrow.Table(s_path) |> DataFrames.DataFrame) end + println("Writing to s_path=$s_path") Arrow.write(s_path, vcat(sampled_parts...), compress=:zstd) + else + println("Removing s_path=$s_path") + 
rm(s_path, force=true, recursive=true) end # Determine paths for this batch and gather # of rows @@ -613,6 +624,9 @@ function WriteHelper(@nospecialize(format_value)) @show readdir("s3/$(banyan_metadata_bucket_name())") @show Banyan.S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] + println("In Write") + @show readdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/")) + ################################### # Handling Final Batch by Copying # ################################### @@ -620,10 +634,15 @@ function WriteHelper(@nospecialize(format_value)) if nbatches > 1 && batch_idx == nbatches # Copy over location and meta path actual_meta_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - actual_sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))$sample_rate" - if worker_idx == 1 + actual_sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" + actual_sample_path = "$actual_sample_dir/$sample_rate" + if is_main cp(m_path, actual_meta_path, force=true) - cp(s_path, actual_sample_path, force=true) + if !sample_invalid + mkpath(actual_sample_dir) + println("Copying from s_path=$s_path to actual_sample_path=$actual_sample_path") + cp(s_path, actual_sample_path, force=true) + end end # Copy over files to actual location diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index b4688eb0..e9aec975 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -79,14 +79,14 @@ end end -@testset "Reading/writing $(shuffled ? "shuffle " : " ")$format data and sampling it with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for scheduling_config in +@testset "Reading/writing $(shuffled ? 
"shuffled " : "")$format data and sampling it with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for scheduling_config in [ "default scheduling", "parallelism encouraged", "parallelism and batches encouraged", ], format in ["csv", "parquet"], - max_num_bytes in [0, Banyan.parse_bytes("100 GB")], + max_num_bytes in [0, 100_000_000_000], shuffled in [true, false] use_session_for_testing(scheduling_config_name = scheduling_config) do @@ -94,13 +94,14 @@ end bucket = get_cluster_s3_bucket_name() - configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled) + configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled, for_all_locations=true, default=true) exact_sample = max_num_bytes > 0 invalidate_all_locations() p1 = "s3://$(bucket)/iris_large.$format" p2 = "s3://$(bucket)/iris_large_tmp.$format" + println("has_sample(p2)=$(has_sample(p2)) after invalidation") df = read_table(p1; metadata_invalid=true, invalidate_samples=true) sample(df) @@ -109,13 +110,12 @@ end @show get_sample_rate(p1) configure_sampling(p2; sample_rate=5) + println("Before write_table") @show get_sampling_configs() write_table(df, p2) @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) - sleep(5) - @test has_metadata(p2) @test has_sample(p2) == !exact_sample invalidate_metadata(p2) @test !has_metadata(p2) @@ -135,14 +135,18 @@ end df2 = read_table(p2; samples_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 + println("After bad get_sample_rate") configure_sampling(sample_rate=7, for_all_locations=true) @test get_sample_rate(p2) == 5 + println("After bad get_sample_rate") df2 = read_table(p2; metadata_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 @test get_sample_rate() == 7 - configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + configure_sampling(sample_rate=7, for_all_locations=true) @test get_sample_rate(p2) == 5 + configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + @test get_sample_rate(p2) == 7 @test get_sample_rate() == 7 df2 = read_table(p2) @test get_sample_rate(p2) == 7 diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index ee214963..585da17c 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -26,7 +26,8 @@ end HDF5_getindex_retry = retry(HDF5.getindex; delays=Base.ExponentialBackOff(; n=5)) -function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig) +function _remote_hdf5_source(lp::LocationPath, loc::Location) + sc = get_sampling_config(lp) path_and_subpath = lp.path shuffled = sc.assume_shuffled curr_metadata_invalid = loc.metadata_invalid @@ -154,7 +155,9 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig # Get paths to store metadata and sample in metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" + sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" + mkpath(sample_dir) + sample_path = "$sample_dir/$sample_rate" # Store metadata and sample in S3 Arrow.write(metadata_path, Arrow.Table(); metadata=src_params) diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 783b858f..3bd07e40 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -272,7 +272,8 @@ _load_image_and_add_channelview(path_on_worker::String) = 
load_retry(path_on_wor _reshape_image(image) = reshape(image, (1, size(image)...)) -function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfig, remotepath, add_channelview::Bool) +function _remote_image_source(lp::LocationPath, loc::Location, remotepath, add_channelview::Bool) + sc = get_sampling_config(lp) curr_sample_invalid = loc.sample_invalid curr_metadata_invalid = loc.metadata_invalid @@ -296,7 +297,9 @@ function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfi # Get paths to store metadata and sample in metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$(sc.rate))" + sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" + mkpath(sample_dir) + sample_path = "$sample_dir/$(sc.rate)" # Iterable object that iterates over local paths localpaths = curr_metadata_invalid ? getpaths(remotepath) : Arrow.Table(metadata_path).path From 498890a19e455659cf9855da02843c585322ee84 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Wed, 17 Aug 2022 11:41:51 -0400 Subject: [PATCH 20/25] Fix new sampling system for HDF5 --- Banyan/src/Banyan.jl | 2 +- Banyan/src/location.jl | 92 ++++++++++++------- Banyan/src/locations.jl | 30 ++++-- Banyan/src/queues.jl | 33 ++++--- Banyan/src/requests.jl | 30 ++++-- Banyan/src/samples.jl | 2 +- Banyan/src/utils.jl | 10 +- Banyan/src/utils_pfs.jl | 2 +- BanyanDataFrames/src/locations.jl | 4 +- BanyanDataFrames/src/precompile.jl | 5 +- BanyanDataFrames/test/sample_collection.jl | 3 +- BanyanHDF5/Project.toml | 2 + BanyanHDF5/src/BanyanHDF5.jl | 3 +- BanyanHDF5/src/hdf5.jl | 1 - BanyanHDF5/src/locations.jl | 15 +-- BanyanHDF5/test/hdf5.jl | 70 ++++++++------ BanyanImages/Project.toml | 1 + BanyanImages/src/BanyanImages.jl | 2 +- BanyanImages/src/image.jl | 6 +- BanyanImages/src/locations.jl | 14 +-- BanyanImages/test/jpg.jl | 101 +++++++++++++-------- BanyanImages/test/runtests.jl | 2 +- 22 files changed, 270 insertions(+), 160 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 8bde1ae3..da4a6c94 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -50,7 +50,7 @@ using AWSS3 global BANYAN_API_ENDPOINT # Account management -export configure +export configure, get_organization_id # Cluster management export Cluster, diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 6ec0f39a..4b9458d2 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -14,6 +14,8 @@ mutable struct Location sample_invalid::Bool end +LOCATION_PATH_KWARG_NAMES = ["add_channelview"] + struct LocationPath original_path::String path::String @@ -22,47 +24,61 @@ struct LocationPath format_name::String format_version::String - function LocationPath(path, format_name, format_version) + function LocationPath(path::Any, format_name::String, format_version::String; kwargs...) + LocationPath("lang_jl_$(hash(path))", format_name, format_version; kwargs...) + end + function LocationPath(path::String, format_name::String, format_version::String; kwargs...) # This function is responsible for "normalizing" the path. # If there are multiple path strings that are technically equivalent, # this function should map them to the same string. 
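        # Illustrative example (an editorial sketch, not code from this patch; the path,
        # format version, and kwarg value are hypothetical): with
        # LOCATION_PATH_KWARG_NAMES = ["add_channelview"], a call such as
        #     LocationPath("s3://some-bucket/images", "jl", "1.8"; add_channelview=true)
        # is intended to append the recognized kwarg and hash the resulting string
        # "s3://some-bucket/images_add_channelview=true" (as the kwarg-appending loop added
        # just below is meant to do), so equivalent requests map to the same cached
        # metadata/sample entry.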
- path_hash = hash(path) + + # Add the kwargs to the path + path_res = deepcopy(path) + for (kwarg_name, kwarg_value) in kwargs + if kwarg_name in LOCATION_PATH_KWARG_NAMES + path_res *= "_$kwarg_name=$kwarg_value" + end + end + + # Return the LocationPath + path_hash = hash(path_res) new( - path, - path, + path_res, + path_res, path_hash, string(path_hash), format_name, format_version ) end + + LocationPath(p; kwargs...) = LocationPath("lang_jl_$(hash(path))"; kwargs...) + function LocationPath(p::String; kwargs...)::LocationPath + if isempty(p) + return NO_LOCATION_PATH + end + + format_name = get(kwargs, :format, "jl") + is_sample_format_arrow = format_name == "arrow" + if is_sample_format_arrow + return LocationPath(p, "arrow", get(kwargs, :format_version, "2"); kwargs...) + else + for table_format in TABLE_FORMATS + if occursin(table_format, p) || format_name == p + return LocationPath(p, "arrow", "2"; kwargs...) + end + end + end + LocationPath(p, "jl", get_julia_version(); kwargs...) + end - LocationPath(path) = LocationPath(path, "jl", get_julia_version())`` + # TODO: Maybe make end # Functions with `LocationPath`s` global TABLE_FORMATS = ["csv", "parquet", "arrow"] -function get_location_path_with_format(p::String; kwargs...)::LocationPath - if isempty(p) - return NO_LOCATION_PATH - end - - format_name = get(kwargs, :format, "jl") - is_sample_format_arrow = format_name == "arrow" - if is_sample_format_arrow - return LocationPath(p, "arrow", get(kwargs, :format_version, "2")) - else - for table_format in TABLE_FORMATS - if occursin(table_format, p) || format_name == p - return LocationPath(p, "arrow", "2") - end - end - end - LocationPath(p, "jl", get_julia_version()) -end - function get_sample_path_prefix(lp::LocationPath) format_name_sep = !isempty(lp.format_name) ? "_" : "" lp.path_hash * "_" * lp.format_name * format_name_sep * lp.format_version @@ -85,7 +101,7 @@ function set_sampling_configs(d::Dict{LocationPath,SamplingConfig}) session_sampling_configs[_get_session_id_no_error()] = d end -get_sampling_config(path=""; kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) +get_sampling_config(path=""; kwargs...) = get_sampling_config(LocationPath(path; kwargs...)) function get_sampling_configs() global session_sampling_configs session_sampling_configs[_get_session_id_no_error()] @@ -97,8 +113,8 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = # Getting sample rate -get_sample_rate(p::String=""; kwargs...) = - get_sample_rate(get_location_path_with_format(p; kwargs...)) +get_sample_rate(p=""; kwargs...) = + get_sample_rate(LocationPath(p; kwargs...)) function parse_sample_rate(object_key) parse(Int64, last(splitpath(object_key))) end @@ -138,15 +154,15 @@ end # Checking for having metadata, samples -has_metadata(p::String=""; kwargs...) = - has_metadata(get_location_path_with_format(p; kwargs...)) +has_metadata(p=""; kwargs...) = + has_metadata(LocationPath(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path)) and banyan_metadata_bucket_name()=$(banyan_metadata_bucket_name())") isfile(S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(l_path))")) end -has_sample(p::String=""; kwargs...) = - has_sample(get_location_path_with_format(p; kwargs...)) +has_sample(p=""; kwargs...) 
= + has_sample(LocationPath(p; kwargs...)) function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) banyan_sample_dir = S3Path("s3://$(banyan_samples_bucket_name())/$(get_sample_path_prefix(l_path))") @@ -200,7 +216,7 @@ function get_metadata_local_path() end function get_samples_local_path() - p = joinpath(homedir(), ".banyan", "metadata") + p = joinpath(homedir(), ".banyan", "samples") if !isdir(p) mkpath(p) end @@ -306,6 +322,8 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix/$sample_rate" try + @show sample_local_path + @show sample_s3_path blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) write(sample_local_path, seekstart(blob.io)) # This overwrites the existing file final_local_sample_path = sample_local_path @@ -330,10 +348,11 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} end # If no such sample is found, search the S3 bucket + banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") + banyan_samples_object_dir = joinpath(banyan_samples_bucket, sample_path_prefix) if isempty(final_local_sample_path) - banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") final_sample_rate = -1 - banyan_samples_object_dir = joinpath(banyan_samples_bucket, sample_path_prefix) + @show readdir_no_error(banyan_samples_object_dir) for object_key in readdir_no_error(banyan_samples_object_dir) object_sample_rate = parse(Int64, object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) @@ -353,6 +372,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} Path(final_local_sample_path) ) end + @show readdir_no_error(banyan_samples_object_dir) end # Construct and return LocationSource @@ -364,11 +384,15 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} ) res_location.metadata_invalid = isempty(src_params) res_location.sample_invalid = isempty(final_local_sample_path) + @show res_location @show final_sample_rate @show final_local_sample_path final_sample_rate = isempty(final_local_sample_path) ? desired_sample_rate : final_sample_rate @show desired_sample_rate @show sample_local_dir + @show readdir(sample_local_dir) + println("At end of get_location_source with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") + ( res_location, metadata_local_path, diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index d7decd31..0a3e3525 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -294,7 +294,7 @@ getsamplenrows(totalnrows::Int64)::Int64 = begin # eventually stored and updated in S3 on each write. function invalidate_metadata(p; kwargs...) - lp = get_location_path_with_format(p; kwargs...) + lp = LocationPath(p; kwargs...) # Delete locally p = joinpath(homedir(), ".banyan", "metadata", get_metadata_path(lp)) @@ -310,7 +310,7 @@ function invalidate_metadata(p; kwargs...) end end function invalidate_samples(p; kwargs...) - lp = get_location_path_with_format(p; kwargs...) + lp = LocationPath(p; kwargs...) 
# Delete locally samples_local_dir = joinpath(homedir(), ".banyan", "samples") @@ -344,11 +344,8 @@ function partition(series, partition_size) (series[i:min(i+(partition_size-1),end)] for i in 1:partition_size:length(series)) end function invalidate_all_locations() - for subdir in ["samples", "metadata"] - local_dir = joinpath(homedir(), ".banyan", subdir) - if isdir(local_dir) - rm(local_dir; force=true, recursive=true) - end + for local_dir in [get_samples_local_path(), get_metadata_local_path()] + rm(local_dir; force=true, recursive=true) end # Delete from S3 @@ -435,9 +432,13 @@ function RemoteSource( # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. loc, local_metadata_path, local_sample_path = get_location_source(lp) + let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") + println("Before get_location_source with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir)) and loc.metadata_invalid=$(loc.metadata_invalid) and loc.sample_invalid=$(loc.sample_invalid)") + end @show lp @show get_sampling_configs() @show local_sample_path + @show loc res = if !loc.metadata_invalid && !loc.sample_invalid # Case where both sample and parameters are valid @@ -446,7 +447,19 @@ function RemoteSource( loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid + let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") + println("Before offloaded with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") + end + let banyan_samples_bucket = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b") + println("Before offloaded with readdir_no_error(banyan_samples_bucket)=$(readdir_no_error(banyan_samples_bucket))") + end new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) + let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") + println("After offloaded with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") + end + let banyan_samples_bucket = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b") + println("After offloaded with readdir_no_error(banyan_samples_bucket)=$(readdir_no_error(banyan_samples_bucket))") + end Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) @show new_loc new_loc.sample.value = load_sample(local_sample_path) @@ -471,5 +484,8 @@ function RemoteSource( new_loc end + let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") + println("At end of RemoteSource with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") + end res end \ No newline at end of file diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index 466ff88c..df6ba120 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -32,6 +32,8 @@ function get_next_message( end end m_dict = m["ReceiveMessageResult"]["Message"] + @show m_dict["MessageId"] + @show m_dict["ReceiptHandle"] if delete SQS.delete_message(queue_url, m_dict["ReceiptHandle"]::String) end @@ -148,24 +150,31 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) end end + for (i, pm) in enumerate(message_ranges) + if i > 1 + println("pm 
== partial_messages[i-1] = $(message[pm] == message[message_ranges[i-1]])") + end + end + # Launch asynchronous threads to send SQS messages gather_q_url = gather_queue_url() num_chunks = length(message_ranges) @show num_chunks if num_chunks > 1 - @sync for i = 1:message_ranges + @sync for i = 1:num_chunks @async begin - msg = Dict{String,Any}( - "kind" => "GATHER", - "value_id" => value_id, - "contents" => message[message_ranges[i]], - "worker_memory_used" => worker_memory_used, - "chunk_idx" => i, - "num_chunks" => num_chunks - ) - msg_json = JSON.json(msg) SQS.send_message( - msg_json, + JSON.json( + Dict{String,Any}( + "kind" => "GATHER", + "value_id" => value_id, + "contents" => message[message_ranges[i]], + "contents_length" => length(message[message_ranges[i]]), + "worker_memory_used" => worker_memory_used, + "chunk_idx" => i, + "num_chunks" => num_chunks + ) + ), gather_q_url, Dict( "MessageGroupId" => string(i), @@ -173,6 +182,8 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) ) ) @show i + @show message_ranges[i] + @show length(message[message_ranges[i]]) end end else diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 32d67630..66720fa9 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -302,9 +302,10 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) chunk_idx = partial_message["chunk_idx"] @show chunk_idx - partial_messages[chunk_idx] = message["contents"] + partial_messages[chunk_idx] = partial_message["contents"] end end + @show length.(partial_messages) join(partial_messages) else message["contents"] @@ -719,14 +720,29 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) @show num_chunks whole_message_contents = if num_chunks > 1 - partial_messages = Vector{String}(undef, num_chunks) + partial_messages = fill("", num_chunks) partial_messages[message["chunk_idx"]] = message["contents"] - @sync for i = 1:num_remaining_chunks + @show message["chunk_idx"] + @sync for _ = 1:num_remaining_chunks @async begin - partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) - chunk_idx = partial_message["chunk_idx"] - @show chunk_idx - partial_messages[chunk_idx] = message["contents"] + let partial_message = sqs_receive_next_message(gather_queue, p, nothing, nothing)[1] + chunk_idx = partial_message["chunk_idx"] + partial_messages[chunk_idx] = partial_message["contents"] + @show chunk_idx + @show length(partial_message["contents"]) + @show partial_message["contents_length"] + @show length(partial_messages[chunk_idx]) + @show last(partial_message["contents"], 20) + @show last(partial_messages[chunk_idx], 20) + @show length.(partial_messages) + end + end + end + # TODO: Fix this so that it gets the partial messages which are different lengths + @show length.(partial_messages) + for (i, pm) in enumerate(partial_messages) + if i > 1 + println("pm == partial_messages[i-1] = $(pm == partial_messages[i-1])") end end join(partial_messages) diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 66eb51af..2ce34517 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -21,7 +21,7 @@ function configure_sampling( ) session_id = _get_session_id_no_error() - lp = get_location_path_with_format(path; kwargs...) + lp = LocationPath(path; kwargs...) 
sampling_configs = session_sampling_configs[session_id] if for_all_locations empty!(sampling_configs) diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index bcc38d90..ac93c999 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -610,6 +610,8 @@ TYPE_TO_STR = STR_TO_TYPE = invert(TYPE_TO_STR) function type_to_str(ty::DataType)::String + @show ty + @show TYPE_TO_STR global TYPE_TO_STR if haskey(TYPE_TO_STR, ty) TYPE_TO_STR[ty] @@ -619,14 +621,16 @@ function type_to_str(ty::DataType)::String end function type_from_str(s::String) + @show s + @show STR_TO_TYPE if startswith(s, "lang_") if startswith(s, "lang_jl_") - from_jl_string(s[4:end]) + from_jl_string(s[9:end]) else error("Cannot parse type $s from non-Julia language") end - elseif haskey(TYPE_TO_STR, s) - TYPE_TO_STR[s] + elseif haskey(STR_TO_TYPE, s) + STR_TO_TYPE[s] else error("Type not supported. You may need to update to the latest version of Banyan or declare the data/sample/metadata you are accessing invalid.") end diff --git a/Banyan/src/utils_pfs.jl b/Banyan/src/utils_pfs.jl index 9def168c..fe0aaabb 100644 --- a/Banyan/src/utils_pfs.jl +++ b/Banyan/src/utils_pfs.jl @@ -510,7 +510,7 @@ function getpath(path::String)::String # disk if it doesn't fit in free memory # TODO: Add option for Internet locations as to whether or not to # cache on disk - hashed_path = get_remotepath_id(path) + hashed_path = LocationPath(path).path_hash joined_path = "efs/job_$(Banyan.get_session().resource_id)_dataset_$(hashed_path)_$(MPI.Comm_rank(MPI.COMM_WORLD))" # @info "Downloading $path to $joined_path" # if MPI.Comm_rank(comm) == 0 diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 3581365e..1300aea7 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -32,8 +32,7 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" mkpath(sample_dir) sample_path = "$sample_dir/$sample_rate" - @show sample_path - @show sample_rate + println("In _remote_table_source at start with readdir_no_error(sample_dir)=$(readdir_no_error(sample_dir))") # Get metadata if it is still valid curr_meta::Arrow.Table = if !curr_metadata_invalid @@ -383,6 +382,7 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location ) end + println("In _remote_table_source with curr_sample_invalid=$curr_sample_invalid for writing to $sample_path and readdir_no_error(sample_dir)=$(readdir_no_error(sample_dir))") # Write the sample to S3 cache if previously invalid if curr_sample_invalid write(sample_path, remote_sample.value) diff --git a/BanyanDataFrames/src/precompile.jl b/BanyanDataFrames/src/precompile.jl index 17ecc87a..30d5d270 100644 --- a/BanyanDataFrames/src/precompile.jl +++ b/BanyanDataFrames/src/precompile.jl @@ -191,7 +191,7 @@ function _precompile_() end # locations.jl - precompile(_remote_table_source, (String, Bool, Bool, Bool, Bool, Bool)) + precompile(_remote_table_source, (LocationPath, Location)) # df.jl precompile(Banyan.orderinghashes, (DataFrames.DataFrame, String)) @@ -298,9 +298,6 @@ function _precompile_() precompile(Arrow.write, (String,)) precompile(Arrow.write, (DataFrames.DataFrame,)) - # locations.jl - precompile(_remote_table_source, (String, Bool, Bool, Bool, Bool, Bool, Int64)) - # TODO: Maybe run code here to precompile # df = Future() diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 
e9aec975..4f8b102a 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -116,6 +116,7 @@ end @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) + # NOTE: We don't compute _exact_ samples on writing @test has_sample(p2) == !exact_sample invalidate_metadata(p2) @test !has_metadata(p2) @@ -126,7 +127,7 @@ end @show get_sample_rate(p2) df2 = read_table(p2) - @show Banyan.get_location_path_with_format(p2) + @show Banyan.LocationPath(p2) @show get_sampling_configs() @show get_sampling_config(p2) @show get_sample_rate(p2) diff --git a/BanyanHDF5/Project.toml b/BanyanHDF5/Project.toml index 4b89c068..425f90e7 100644 --- a/BanyanHDF5/Project.toml +++ b/BanyanHDF5/Project.toml @@ -4,6 +4,7 @@ authors = ["Caleb Winston "] version = "0.2.1" [deps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Banyan = "706d138b-e922-45b9-a636-baf8ae0d5317" BanyanArrays = "369465de-032e-4609-9dcf-82b89c370a7b" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" @@ -12,6 +13,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [compat] +Arrow = "2" Banyan = "0.4.1" BanyanArrays = "0.4.1" HDF5 = "^0.16" diff --git a/BanyanHDF5/src/BanyanHDF5.jl b/BanyanHDF5/src/BanyanHDF5.jl index 10d13816..a215f710 100644 --- a/BanyanHDF5/src/BanyanHDF5.jl +++ b/BanyanHDF5/src/BanyanHDF5.jl @@ -1,6 +1,7 @@ module BanyanHDF5 -using Banyan, +using Arrow, + Banyan, BanyanArrays, HDF5, MPI, diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index ecd529b4..63446fa1 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -4,7 +4,6 @@ function read_hdf5(path; kwargs...) A_loc.src_name == "Remote" || error("$path does not exist") invalidate(path; after=true, kwargs...) 
A = Future(datatype="Array", source=A_loc) - A_loc_eltype, A_loc_size = Banyan.from_jl_string(A_loc.src_parameters["eltype_and_size"]) A_loc_eltype = Banyan.type_from_str(A_loc.src_parameters["eltype"]) A_loc_size = Banyan.size_from_str(A_loc.src_parameters["size"]) A_loc_ndims = length(A_loc_size) diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 585da17c..fd85b1d4 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -30,8 +30,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location) sc = get_sampling_config(lp) path_and_subpath = lp.path shuffled = sc.assume_shuffled - curr_metadata_invalid = loc.metadata_invalid - curr_sample_invalid = loc.sample_invalid + curr_metadata_invalid, curr_sample_invalid = loc.metadata_invalid, loc.sample_invalid # Get session information sample_rate = sc.rate @@ -144,8 +143,8 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location) "path_and_subpath" => path_and_subpath, "path" => remotepath, "subpath" => datasetpath, - "eltype" => Banyan.size_to_str(dataszie), - "size" => Banyan.type_to_str(dataeltype), + "eltype" => Banyan.type_to_str(dataeltype), + "size" => Banyan.size_to_str(datasize), "sample_memory_usage" => string(nbytes), "format" => "hdf5" ) @@ -160,8 +159,12 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location) sample_path = "$sample_dir/$sample_rate" # Store metadata and sample in S3 - Arrow.write(metadata_path, Arrow.Table(); metadata=src_params) - serialize(sample_path, dset_sample) + if curr_metadata_invalid + Arrow.write(metadata_path, Arrow.Table(); metadata=src_params) + end + if curr_sample_invalid + serialize(sample_path, dset_sample) + end # Return Location to client side LocationSource("Remote", src_params, nbytes, dset_sample) diff --git a/BanyanHDF5/test/hdf5.jl b/BanyanHDF5/test/hdf5.jl index 23ac8254..0cd0c566 100644 --- a/BanyanHDF5/test/hdf5.jl +++ b/BanyanHDF5/test/hdf5.jl @@ -33,19 +33,23 @@ end # TODO: Add tests here modeled after BDF.jl -@testset "Reading and sampling HDF5 in $src with $scheduling_config with max_num_bytes_exact=$max_num_bytes and shuffled=$shuffled" for scheduling_config in [ - "default scheduling", - "parallelism encouraged", - "parallelism and batches encouraged", -], -src in ["Internet", "S3"], -max_num_bytes in [0, Banyan.parse_bytes("100 GB")], -shuffled in [true, false] +@testset "Reading $(shuffled ? 
"shuffled " : "")$src data and sampling it with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for + scheduling_config in [ + "default scheduling", + "parallelism encouraged", + "parallelism and batches encouraged", + ], + src in ["Internet", "S3"], + max_num_bytes in [0, 100_000_000_000], + shuffled in [true, false] + get_organization_id() - use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do - invalidate_all_locations() + use_session_for_testing(scheduling_config_name = scheduling_config) do use_data() - configure_sampling(max_num_bytes_exact=max_num_bytes, assume_shuffled=shuffled) + configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled, for_all_locations=true, default=true) + exact_sample = max_num_bytes > 0 + + invalidate_all_locations() p = if src == "S3" joinpath("s3://", get_cluster_s3_bucket_name(), "fillval.h5/DS1") @@ -53,12 +57,16 @@ shuffled in [true, false] joinpath("https://github.com/banyan-team/banyan-julia/raw/v0.1.1/BanyanArrays/test/res", "fillval.h5/DS1") end - x = read_hdf5(p) - sample(x) - @show get_sample_rate(x) + df = read_hdf5(p; metadata_invalid=true, invalidate_samples=true) + sample(df) + @show max_num_bytes + @show exact_sample + @show get_sample_rate(p) configure_sampling(p; sample_rate=5) - x = read_hdf5(p) + @show get_sampling_configs() + read_hdf5(p) + @show get_sampling_configs() @test get_sample_rate(p) == 5 @test has_metadata(p) @test has_sample(p) @@ -69,30 +77,40 @@ shuffled in [true, false] @test !has_metadata(p) @test !has_sample(p) - x = read_hdf5(p) @show get_sample_rate(p) - sample(x) + df2 = read_hdf5(p) + @show Banyan.LocationPath(p) + @show get_sampling_configs() + @show get_sampling_config(p) @show get_sample_rate(p) - x = read_hdf5(p; samples_invalid=true) - sample(x) + sample(df2) + @show get_sample_rate(p) + df2 = read_hdf5(p; samples_invalid=true) + sample(df2) + @test get_sample_rate(p) == 5 configure_sampling(sample_rate=7, for_all_locations=true) - x = read_hdf5(p; metadata_invalid=true) - sample(x) @test get_sample_rate(p) == 5 + df2 = read_hdf5(p; metadata_invalid=true) + sample(df2) + @test get_sample_rate(p) == 5 + println("Bad get_sample_rate") @test get_sample_rate() == 7 - configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + configure_sampling(sample_rate=7, for_all_locations=true) @test get_sample_rate(p) == 5 + println("Bad get_sample_rate") + configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + @test get_sample_rate(p) == 7 @test get_sample_rate() == 7 - x = read_hdf5(p) + df2 = read_hdf5(p) @test get_sample_rate(p) == 7 @test get_sample_rate() == 7 - x = read_hdf5(p; location_invalid=true) - sample(x) + df2 = read_hdf5(p; location_invalid=true) + sample(df2) @test has_metadata(p) @test has_sample(p) @show get_sample_rate(p) configure_sampling(p; always_exact=true) - sample(x) + sample(df2) end end diff --git a/BanyanImages/Project.toml b/BanyanImages/Project.toml index cf7cbabd..87afc272 100644 --- a/BanyanImages/Project.toml +++ b/BanyanImages/Project.toml @@ -14,6 +14,7 @@ ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] diff --git a/BanyanImages/src/BanyanImages.jl b/BanyanImages/src/BanyanImages.jl 
index 3d08dc2e..4bebbe4d 100644 --- a/BanyanImages/src/BanyanImages.jl +++ b/BanyanImages/src/BanyanImages.jl @@ -2,7 +2,7 @@ module BanyanImages using Banyan, BanyanArrays -using Arrow, FileIO, ImageCore, ImageIO, MPI, ProgressMeter, Random, Tables +using Arrow, FileIO, ImageCore, ImageIO, MPI, ProgressMeter, Random, Serialization, Tables export read_png, # write_png, read_jpg #, write_jpg diff --git a/BanyanImages/src/image.jl b/BanyanImages/src/image.jl index b413a9ce..b309ea3b 100644 --- a/BanyanImages/src/image.jl +++ b/BanyanImages/src/image.jl @@ -1,11 +1,11 @@ -function read_png(path; add_channelview=false) +function read_png(path; add_channelview=false, kwargs...) invalidate(path; kwargs...) image_loc = RemoteImageSource(path, add_channelview) image_loc.src_name == "Remote" || error("$path does not exist") invalidate(path; after=true, kwargs...) image = Future(;source=image_loc, datatype="Array") - image_loc_eltype = type_from_str(image_loc.src_parameters["eltype"]) - image_loc_size = size_from_str(image_loc.src_parameters["size"]) + image_loc_eltype = Banyan.type_from_str(image_loc.src_parameters["eltype"]) + image_loc_size = Banyan.size_from_str(image_loc.src_parameters["size"]) image_loc_ndims = length(image_loc_size) BanyanArrays.Array{image_loc_eltype,image_loc_ndims}(image, Future(image_loc_size)) end diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 3bd07e40..ac6a180c 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -369,18 +369,18 @@ function _remote_image_source(lp::LocationPath, loc::Location, remotepath, add_c "name" => "Remote", "nimages" => string(nimages), "sample_memory_usage" => string(nbytes_res), # NOTE: We assume all files have same size - "size" => size_to_str(datasize_res), - "eltype" => type_to_str(dataeltype_res), + "size" => Banyan.size_to_str(datasize_res), + "eltype" => Banyan.type_to_str(dataeltype_res), "add_channelview" => add_channelview ? "1" : "0", "format" => "image" ) else - curr_location.src_parameters + Banyan.get_src_params_dict_from_arrow(metadata_path) end # Store metadata and sample in S3 if curr_metadata_invalid - Arrow.write(metadata_path, (path=localpaths,); metadata=src_params) + Arrow.write(metadata_path, (path=localpaths,); metadata=src_parameters) end if curr_sample_invalid serialize(sample_path, remote_sample) @@ -395,11 +395,7 @@ end RemoteImageSource(remotepath, add_channelview)::Location = RemoteSource( - LocationPath( - remotepath isa String ? remotepath : "lang_jl_$(hash(remotepath))", - add_channelview ? "jl_channelview" : "jl", - Banyan.get_julia_version() - ), + LocationPath(remotepath; add_channelview=add_channelview), _remote_image_source, deserialize, identity, diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl index ab982b79..b19bdba5 100644 --- a/BanyanImages/test/jpg.jl +++ b/BanyanImages/test/jpg.jl @@ -79,65 +79,86 @@ invalid_bool_to_str(metadata_invalid) = metadata_invalid ? "invalid" : "valid" end end -@testset "Reading and sampling $nimage JPG images on $loc with $format and add_channelview=$add_channelview, max_num_bytes=$max_num_bytes, shuffled=$shuffled" for +@testset "Reading and sampling $nimages $(shuffled ? 
"shuffled" : "") JPG images on $loc with $format and add_channelview=$add_channelview with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for + scheduling_config in [ + "default scheduling", + "parallelism encouraged", + "parallelism and batches encouraged", + ], (loc, format) in [ ("Internet", "generator"), ("S3", "generator"), ("S3", "directory") ], - max_num_bytes in [0, Banyan.parse_bytes("100 GB")], + max_num_bytes in [0, 100_000_000_000], shuffled in [true, false], nimages in [1, 50], add_channelview in [true, false] + get_organization_id() use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do - bucket_name = get_cluster_s3_bucket_name() + configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled, for_all_locations=true, default=true) + exact_sample = max_num_bytes > 0 + invalidate_all_locations() - configure_sampling(max_num_bytes_exact=max_num_bytes, assume_shuffled=shuffled) + bucket_name = get_cluster_s3_bucket_name() p = get_test_path(loc, "generator", "jpg", nimages, bucket_name) - x = read_jpg(p; add_channelview=add_channelview) - sample(x) - @show get_sample_rate(x) - - # TODO: Ensure that this triggers parallel cluster<->client data transfer - configure_sampling(p; sample_rate=20) - x = read_jpg(p; add_channelview=add_channelview) - @test get_sample_rate(p) == 20 - @test has_metadata(p) - @test has_sample(p) - invalidate_metadata(p) - @test !has_metadata(p) - @test has_sample(p) - invalidate_location(p) - @test !has_metadata(p) - @test !has_sample(p) - - x = read_jpg(p; add_channelview=add_channelview) - @show get_sample_rate(p) - sample(x) - @show get_sample_rate(p) - x = read_jpg(p; add_channelview=add_channelview, samples_invalid=true) - sample(x) + df = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true, invalidate_samples=true) + sample(df) + @show max_num_bytes + @show exact_sample + @show get_sample_rate(p; add_channelview=add_channelview) + + configure_sampling(p; sample_rate=50) + @show get_sampling_configs() + read_jpg(p; add_channelview=add_channelview) + @show get_sampling_configs() + @test get_sample_rate(p; add_channelview=add_channelview) == 50 + @test has_metadata(p; add_channelview=add_channelview) + @test has_sample(p; add_channelview=add_channelview) + invalidate_metadata(p; add_channelview=add_channelview) + @test !has_metadata(p; add_channelview=add_channelview) + @test has_sample(p; add_channelview=add_channelview) + invalidate_location(p; add_channelview=add_channelview) + @test !has_metadata(p; add_channelview=add_channelview) + @test !has_sample(p; add_channelview=add_channelview) + + @show get_sample_rate(p; add_channelview=add_channelview) + df2 = read_jpg(p; add_channelview=add_channelview) + @show Banyan.LocationPath(p; add_channelview=add_channelview) + @show get_sampling_configs() + @show get_sampling_config(p; add_channelview=add_channelview) + @show get_sample_rate(p; add_channelview=add_channelview) + sample(df2) + @show get_sample_rate(p; add_channelview=add_channelview) + df2 = read_jpg(p; add_channelview=add_channelview, samples_invalid=true) + sample(df2) + @test get_sample_rate(p; add_channelview=add_channelview) == 50 configure_sampling(sample_rate=75, for_all_locations=true) - x = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true) - sample(x) - @test get_sample_rate(p) == 50 + @test get_sample_rate(p; add_channelview=add_channelview) == 50 + df2 = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true) + 
sample(df2) + @test get_sample_rate(p; add_channelview=add_channelview) == 50 + println("Bad get_sample_rate") @test get_sample_rate() == 75 + configure_sampling(sample_rate=75, for_all_locations=true) + @test get_sample_rate(p; add_channelview=add_channelview) == 50 + println("Bad get_sample_rate") configure_sampling(sample_rate=75, force_new_sample_rate=true, for_all_locations=true) - @test get_sample_rate(p) == 50 + @test get_sample_rate(p; add_channelview=add_channelview) == 75 @test get_sample_rate() == 75 - x = read_jpg(p; add_channelview=add_channelview) - @test get_sample_rate(p) == 75 + df2 = read_jpg(p; add_channelview=add_channelview) + @test get_sample_rate(p; add_channelview=add_channelview) == 75 @test get_sample_rate() == 75 - x = read_jpg(p; add_channelview=add_channelview, location_invalid=true) - sample(x) - @test has_metadata(p) - @test has_sample(p) - @show get_sample_rate(p) - configure_sampling(p; always_exact=true) - sample(x) + df2 = read_jpg(p; add_channelview=add_channelview, location_invalid=true) + sample(df2) + @test has_metadata(p; add_channelview=add_channelview) + @test has_sample(p; add_channelview=add_channelview) + @show get_sample_rate(p; add_channelview=add_channelview) + configure_sampling(p; add_channelview=add_channelview, always_exact=true) + sample(df2) end end diff --git a/BanyanImages/test/runtests.jl b/BanyanImages/test/runtests.jl index 695a4f22..58b1eb6b 100644 --- a/BanyanImages/test/runtests.jl +++ b/BanyanImages/test/runtests.jl @@ -9,7 +9,7 @@ MPI.Init() # Create a dummy test session for unit tests test_session_id = "test_session_id" test_resource_id = "test_resource_id" -Banyan.sessions[test_session_id] = Session(ENV["BANYAN_CLUSTER_NAME"], test_session_id, test_resource_id, 2, 2) +Banyan.sessions[test_session_id] = Session(ENV["BANYAN_CLUSTER_NAME"], test_session_id, test_resource_id, 2) global sessions_for_testing = Dict() From c627846d2530c533f6ccdc103b482f3dd51f9400 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Thu, 18 Aug 2022 12:32:05 -0400 Subject: [PATCH 21/25] Fix new sampling system for BanyanImages --- BanyanHDF5/src/locations.jl | 2 +- BanyanImages/src/locations.jl | 10 +++++++--- BanyanImages/test/jpg.jl | 10 ++++++---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index fd85b1d4..a725b6b6 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -90,7 +90,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location) # aggregate and concatenate it on the main worker rand_indices_range = split_len(datalength, worker_idx, nworkers) rand_indices = sample_from_range(rand_indices_range, sample_rate) - exact_sample_needed = nbytes < sc.max_num_bytes_exact + exact_sample_needed = nbytes < sc.max_num_bytes_exact || sc.always_exact remaining_colons = Base.fill(Colon(), datandims-1) dset_sample_value = if !exact_sample_needed samples_on_workers = gather_across( diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index ac6a180c..463fcae2 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -308,9 +308,13 @@ function _remote_image_source(lp::LocationPath, loc::Location, remotepath, add_c # Read in images on each worker. We need to read in at least one image # regardless of whether we want to get the sample or the metadata _load_img = add_channelview ? _load_image_and_add_channelview : _load_image - first_img = is_main ? 
(localpaths[1] |> _load_img |> _reshape_image) : nothing - exact_sample_needed = is_main ? ((sample_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false - exact_sample_needed = sync_across(exact_sample_needed) + first_img = is_main ? (localpaths[1] |> getpath |> _load_img |> _reshape_image) : nothing + exact_sample_needed = if sc.always_exact + true + else + esn = is_main ? ((sample_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false + sync_across(esn) + end need_to_parallelize = nimages >= 10 total_num_images_to_read_in = if curr_sample_invalid exact_sample_needed ? nimages : cld(nimages, sc.rate) diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl index b19bdba5..32861306 100644 --- a/BanyanImages/test/jpg.jl +++ b/BanyanImages/test/jpg.jl @@ -90,10 +90,12 @@ end ("S3", "generator"), ("S3", "directory") ], - max_num_bytes in [0, 100_000_000_000], - shuffled in [true, false], - nimages in [1, 50], - add_channelview in [true, false] + (max_num_bytes, nimages, add_channelview) in [ + (0, 1, false), + (0, 50, true), + (100_000_000_000, 1, true) + ], + shuffled in [true, false] get_organization_id() use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do From 2ae6477b0fae63f51eccd30cc7383cfdf99ae26e Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Thu, 18 Aug 2022 14:09:58 -0400 Subject: [PATCH 22/25] Remove print statements --- Banyan/src/clusters.jl | 1 - Banyan/src/location.jl | 31 ------------------ Banyan/src/locations.jl | 32 ------------------ Banyan/src/queues.jl | 12 ------- Banyan/src/requests.jl | 21 ------------ Banyan/src/utils.jl | 4 --- BanyanDataFrames/src/locations.jl | 15 --------- BanyanDataFrames/src/pfs.jl | 38 ++-------------------- BanyanDataFrames/test/sample_collection.jl | 16 --------- BanyanHDF5/src/pfs.jl | 4 --- BanyanHDF5/test/hdf5.jl | 14 -------- BanyanImages/test/jpg.jl | 31 +++++------------- BanyanImages/test/runtests.jl | 1 + 13 files changed, 11 insertions(+), 209 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 2e7572c3..eabb1215 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -195,7 +195,6 @@ function _get_clusters(cluster_name::String)::Dict{String,Cluster} if !isempty(cluster_name) filters["cluster_name"] = cluster_name end - @show filters response = send_request_get_response(:describe_clusters, Dict{String,Any}("filters"=>filters)) clusters_dict::Dict{String,Cluster} = Dict{String,Cluster}() for (name::String, c::Dict{String,Any}) in response["clusters"]::Dict{String,Any} diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 4b9458d2..b713b55c 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -120,7 +120,6 @@ function parse_sample_rate(object_key) end function get_sample_rate(l_path::LocationPath) sc = get_sampling_config(l_path) - @show sc # Get the desired sample rate desired_sample_rate = sc.rate @@ -138,9 +137,6 @@ function get_sample_rate(l_path::LocationPath) banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") banyan_samples_object_dir = joinpath(banyan_samples_bucket, get_sample_path_prefix(l_path)) sample_rate = -1 - @show banyan_samples_object_dir - @show readdir(banyan_samples_bucket) - @show readdir_no_error(banyan_samples_object_dir) for object_key in readdir_no_error(banyan_samples_object_dir) object_sample_rate = parse(Int64, object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) @@ -157,7 +153,6 @@ end 
has_metadata(p=""; kwargs...) = has_metadata(LocationPath(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool - println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path)) and banyan_metadata_bucket_name()=$(banyan_metadata_bucket_name())") isfile(S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(l_path))")) end @@ -166,20 +161,6 @@ has_sample(p=""; kwargs...) = function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) banyan_sample_dir = S3Path("s3://$(banyan_samples_bucket_name())/$(get_sample_path_prefix(l_path))") - println("In has_sample") - @show sc - @show sc.force_new_sample_rate - @show joinpath(banyan_sample_dir, string(sc.rate)) - @show isdir_no_error(banyan_sample_dir) - @show isdir_no_error(banyan_sample_dir) && !isempty(readdir(banyan_sample_dir)) - @show readdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/")) - @show isdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2")) - @show isdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2/")) - @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2")) - @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arr/")) - @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_200/")) - @show banyan_sample_dir - @show readdir_no_error(banyan_sample_dir) if sc.force_new_sample_rate isfile(joinpath(banyan_sample_dir, string(sc.rate))) else @@ -322,8 +303,6 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix/$sample_rate" try - @show sample_local_path - @show sample_s3_path blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) write(sample_local_path, seekstart(blob.io)) # This overwrites the existing file final_local_sample_path = sample_local_path @@ -352,7 +331,6 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} banyan_samples_object_dir = joinpath(banyan_samples_bucket, sample_path_prefix) if isempty(final_local_sample_path) final_sample_rate = -1 - @show readdir_no_error(banyan_samples_object_dir) for object_key in readdir_no_error(banyan_samples_object_dir) object_sample_rate = parse(Int64, object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) @@ -372,7 +350,6 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} Path(final_local_sample_path) ) end - @show readdir_no_error(banyan_samples_object_dir) end # Construct and return LocationSource @@ -384,15 +361,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} ) res_location.metadata_invalid = isempty(src_params) res_location.sample_invalid = isempty(final_local_sample_path) - @show res_location - @show final_sample_rate - @show final_local_sample_path final_sample_rate = isempty(final_local_sample_path) ? 
desired_sample_rate : final_sample_rate - @show desired_sample_rate - @show sample_local_dir - @show readdir(sample_local_dir) - println("At end of get_location_source with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") - ( res_location, metadata_local_path, diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 0a3e3525..4e5a091b 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -303,7 +303,6 @@ function invalidate_metadata(p; kwargs...) end # Delete from S3 - println("Deleting get_metadata_path(lp)=$(get_metadata_path(lp))") s3p = S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))") if isfile(s3p) rm(s3p) @@ -325,16 +324,9 @@ function invalidate_samples(p; kwargs...) # Delete from S3 s3p = S3Path("s3://$(banyan_samples_bucket_name())/$sample_path_prefix") - @show readdir_no_error(s3p) - @show s3p - @show path_as_dir(s3p) - @show readdir(S3Path("s3://$(banyan_samples_bucket_name())")) if !isempty(readdir_no_error(s3p)) rm(path_as_dir(s3p), recursive=true) end - @show readdir_no_error(s3p) - @show s3p - @show readdir(S3Path("s3://$(banyan_samples_bucket_name())")) end function invalidate_location(p; kwargs...) invalidate_metadata(p; kwargs...) @@ -432,13 +424,6 @@ function RemoteSource( # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. loc, local_metadata_path, local_sample_path = get_location_source(lp) - let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") - println("Before get_location_source with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir)) and loc.metadata_invalid=$(loc.metadata_invalid) and loc.sample_invalid=$(loc.sample_invalid)") - end - @show lp - @show get_sampling_configs() - @show local_sample_path - @show loc res = if !loc.metadata_invalid && !loc.sample_invalid # Case where both sample and parameters are valid @@ -447,21 +432,8 @@ function RemoteSource( loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid - let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") - println("Before offloaded with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") - end - let banyan_samples_bucket = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b") - println("Before offloaded with readdir_no_error(banyan_samples_bucket)=$(readdir_no_error(banyan_samples_bucket))") - end new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) - let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") - println("After offloaded with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") - end - let banyan_samples_bucket = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b") - println("After offloaded with readdir_no_error(banyan_samples_bucket)=$(readdir_no_error(banyan_samples_bucket))") - end Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) - @show new_loc new_loc.sample.value = load_sample(local_sample_path) new_loc else @@ -469,7 +441,6 @@ function RemoteSource( # Get the Location with up-to-date metadata (source parameters) and sample new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) - @show 
new_loc if !loc.metadata_invalid # Store the metadata locally. The local copy just has the source @@ -484,8 +455,5 @@ function RemoteSource( new_loc end - let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") - println("At end of RemoteSource with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") - end res end \ No newline at end of file diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index df6ba120..6e2c16d9 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -32,8 +32,6 @@ function get_next_message( end end m_dict = m["ReceiveMessageResult"]["Message"] - @show m_dict["MessageId"] - @show m_dict["ReceiptHandle"] if delete SQS.delete_message(queue_url, m_dict["ReceiptHandle"]::String) end @@ -150,16 +148,9 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) end end - for (i, pm) in enumerate(message_ranges) - if i > 1 - println("pm == partial_messages[i-1] = $(message[pm] == message[message_ranges[i-1]])") - end - end - # Launch asynchronous threads to send SQS messages gather_q_url = gather_queue_url() num_chunks = length(message_ranges) - @show num_chunks if num_chunks > 1 @sync for i = 1:num_chunks @async begin @@ -181,9 +172,6 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) "MessageDeduplicationId" => generated_message_id * string(i) ) ) - @show i - @show message_ranges[i] - @show length(message[message_ranges[i]]) end end else diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 66720fa9..8578786d 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -291,8 +291,6 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n if is_debug_on() println("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end - - @show num_chunks whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) @@ -301,11 +299,9 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n @async begin partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) chunk_idx = partial_message["chunk_idx"] - @show chunk_idx partial_messages[chunk_idx] = partial_message["contents"] end end - @show length.(partial_messages) join(partial_messages) else message["contents"] @@ -717,34 +713,17 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) println("Gathering $num_chunks chunk$(num_chunks > 1 ? 
"s" : "") to client") end - @show num_chunks - whole_message_contents = if num_chunks > 1 partial_messages = fill("", num_chunks) partial_messages[message["chunk_idx"]] = message["contents"] - @show message["chunk_idx"] @sync for _ = 1:num_remaining_chunks @async begin let partial_message = sqs_receive_next_message(gather_queue, p, nothing, nothing)[1] chunk_idx = partial_message["chunk_idx"] partial_messages[chunk_idx] = partial_message["contents"] - @show chunk_idx - @show length(partial_message["contents"]) - @show partial_message["contents_length"] - @show length(partial_messages[chunk_idx]) - @show last(partial_message["contents"], 20) - @show last(partial_messages[chunk_idx], 20) - @show length.(partial_messages) end end end - # TODO: Fix this so that it gets the partial messages which are different lengths - @show length.(partial_messages) - for (i, pm) in enumerate(partial_messages) - if i > 1 - println("pm == partial_messages[i-1] = $(pm == partial_messages[i-1])") - end - end join(partial_messages) else message["contents"] diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index ac93c999..cb6504a4 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -610,8 +610,6 @@ TYPE_TO_STR = STR_TO_TYPE = invert(TYPE_TO_STR) function type_to_str(ty::DataType)::String - @show ty - @show TYPE_TO_STR global TYPE_TO_STR if haskey(TYPE_TO_STR, ty) TYPE_TO_STR[ty] @@ -621,8 +619,6 @@ function type_to_str(ty::DataType)::String end function type_from_str(s::String) - @show s - @show STR_TO_TYPE if startswith(s, "lang_") if startswith(s, "lang_jl_") from_jl_string(s[9:end]) diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 1300aea7..b67d08d0 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -4,11 +4,6 @@ Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) function _remote_table_source(lp::LocationPath, loc::Location)::Location sampling_config = get_sampling_config(lp) - metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") - haskey(s3_res, "Contents") ? s3_res["Contents"] : [] - end - println("In _remote_table_source at start with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") # Setup for sampling remotepath = lp.path @@ -32,7 +27,6 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" mkpath(sample_dir) sample_path = "$sample_dir/$sample_rate" - println("In _remote_table_source at start with readdir_no_error(sample_dir)=$(readdir_no_error(sample_dir))") # Get metadata if it is still valid curr_meta::Arrow.Table = if !curr_metadata_invalid @@ -348,12 +342,6 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location # If a file does not exist, one of the get_metadata/get_sample functions # will error. - metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") - haskey(s3_res, "Contents") ? 
s3_res["Contents"] : [] - end - println("In _remote_table_source at end with metadata_dir=$metadata_dir and metadata_bucket_dir=$metadata_bucket_dir and metadata_path=$metadata_path and curr_metadata_invalid=$curr_metadata_invalid") - # Get source parameters src_params = Dict( @@ -382,7 +370,6 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location ) end - println("In _remote_table_source with curr_sample_invalid=$curr_sample_invalid for writing to $sample_path and readdir_no_error(sample_dir)=$(readdir_no_error(sample_dir))") # Write the sample to S3 cache if previously invalid if curr_sample_invalid write(sample_path, remote_sample.value) @@ -392,8 +379,6 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location @show (remotepath, meta_path) end - # println("At end of _remote_table_source on get_worker_idx()=$(MPI.Initialized() ? get_worker_idx() : -1)") - # Return LocationSource to client specified # Construct the `Location` to return diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index c1ec11be..bac01b39 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -335,7 +335,7 @@ function ReadBlockHelper(@nospecialize(format_value)) dfs = Base.Vector{Any}(undef, ndfs) if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND - @show (filezs_to_read, get_worker_idx()) + @show (files_to_read, get_worker_idx()) end # Iterate through files and identify which ones correspond to the range of @@ -387,13 +387,7 @@ function WriteHelper(@nospecialize(format_value)) comm::MPI.Comm, loc_name::String, loc_params::Dict{String,Any}, - ) - metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") - haskey(s3_res, "Contents") ? s3_res["Contents"] : [] - end - println("In Write at start with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") - + ) # Get rid of splitting divisions if they were used to split this data into # groups splitting_divisions = Banyan.get_splitting_divisions() @@ -545,12 +539,6 @@ function WriteHelper(@nospecialize(format_value)) # On the main worker, finalize metadata and location info. sample_invalid = false if is_main - metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") - haskey(s3_res, "Contents") ? 
s3_res["Contents"] : [] - end - println("In Write with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") - # Determine paths and #s of rows for metadata file for worker_i in 1:nworkers push!( @@ -591,42 +579,21 @@ function WriteHelper(@nospecialize(format_value)) sample_invalid = true end - println("In Write with sample_invalid=$sample_invalid (because sample_memory_usage=$sample_memory_usage and sampling_config.max_num_bytes_exact=$(sampling_config.max_num_bytes_exact)) and while sampling_config=$sampling_config, writing to $m_path and $s_path, on batch_idx=$batch_idx with curr_src_parameters=$curr_src_parameters") - - @show get_sampling_configs() - @show lp - @show get_sampling_config(lp) - @show s_path - @show s_sample_dir - # Get the actual sample by concatenating if !sample_invalid sampled_parts = [gathered[4] for gathered in gathered_data] if batch_idx > 1 push!(sampled_parts, Arrow.Table(s_path) |> DataFrames.DataFrame) end - println("Writing to s_path=$s_path") Arrow.write(s_path, vcat(sampled_parts...), compress=:zstd) else - println("Removing s_path=$s_path") rm(s_path, force=true, recursive=true) end # Determine paths for this batch and gather # of rows - @show m_path - @show readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b/") - @show readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - bucket_dir = readdir("s3/$(banyan_metadata_bucket_name())") - println("On main in $(banyan_metadata_bucket_name()): $bucket_dir") Arrow.write(m_path, (path=curr_remotepaths, nrows=curr_nrows); compress=:zstd, metadata=curr_src_parameters) end - @show readdir("s3/$(banyan_metadata_bucket_name())") - @show Banyan.S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] - - println("In Write") - @show readdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/")) - ################################### # Handling Final Batch by Copying # ################################### @@ -640,7 +607,6 @@ function WriteHelper(@nospecialize(format_value)) cp(m_path, actual_meta_path, force=true) if !sample_invalid mkpath(actual_sample_dir) - println("Copying from s_path=$s_path to actual_sample_path=$actual_sample_path") cp(s_path, actual_sample_path, force=true) end end diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 4f8b102a..f160f9fd 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -101,19 +101,12 @@ end p1 = "s3://$(bucket)/iris_large.$format" p2 = "s3://$(bucket)/iris_large_tmp.$format" - println("has_sample(p2)=$(has_sample(p2)) after invalidation") df = read_table(p1; metadata_invalid=true, invalidate_samples=true) sample(df) - @show max_num_bytes - @show exact_sample - @show get_sample_rate(p1) configure_sampling(p2; sample_rate=5) - println("Before write_table") - @show get_sampling_configs() write_table(df, p2) - @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) # NOTE: We don't compute _exact_ samples on writing @@ -125,21 +118,13 @@ end @test !has_metadata(p2) @test !has_sample(p2) - @show get_sample_rate(p2) df2 = read_table(p2) - @show Banyan.LocationPath(p2) - @show get_sampling_configs() - @show get_sampling_config(p2) - @show get_sample_rate(p2) sample(df2) - @show get_sample_rate(p2) df2 = read_table(p2; samples_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 - println("After bad get_sample_rate") configure_sampling(sample_rate=7, for_all_locations=true) @test 
get_sample_rate(p2) == 5 - println("After bad get_sample_rate") df2 = read_table(p2; metadata_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 @@ -156,7 +141,6 @@ end sample(df2) @test has_metadata(p2) @test has_sample(p2) - @show get_sample_rate(p2) configure_sampling(p2; always_exact=true) sample(df2) end diff --git a/BanyanHDF5/src/pfs.jl b/BanyanHDF5/src/pfs.jl index 73456013..4c69264d 100644 --- a/BanyanHDF5/src/pfs.jl +++ b/BanyanHDF5/src/pfs.jl @@ -603,10 +603,6 @@ function WriteHelperHDF5( fsync_file(path) MPI.Barrier(comm) end - if true#is_main - f = h5open("/home/ec2-user/s3/banyan-cluster-data-test-lustre-0ce21f27/fillval.h5", "r+", comm, info) - close(f) - end nothing end diff --git a/BanyanHDF5/test/hdf5.jl b/BanyanHDF5/test/hdf5.jl index 0cd0c566..c1575f0a 100644 --- a/BanyanHDF5/test/hdf5.jl +++ b/BanyanHDF5/test/hdf5.jl @@ -59,14 +59,9 @@ end df = read_hdf5(p; metadata_invalid=true, invalidate_samples=true) sample(df) - @show max_num_bytes - @show exact_sample - @show get_sample_rate(p) configure_sampling(p; sample_rate=5) - @show get_sampling_configs() read_hdf5(p) - @show get_sampling_configs() @test get_sample_rate(p) == 5 @test has_metadata(p) @test has_sample(p) @@ -77,14 +72,8 @@ end @test !has_metadata(p) @test !has_sample(p) - @show get_sample_rate(p) df2 = read_hdf5(p) - @show Banyan.LocationPath(p) - @show get_sampling_configs() - @show get_sampling_config(p) - @show get_sample_rate(p) sample(df2) - @show get_sample_rate(p) df2 = read_hdf5(p; samples_invalid=true) sample(df2) @test get_sample_rate(p) == 5 @@ -93,11 +82,9 @@ end df2 = read_hdf5(p; metadata_invalid=true) sample(df2) @test get_sample_rate(p) == 5 - println("Bad get_sample_rate") @test get_sample_rate() == 7 configure_sampling(sample_rate=7, for_all_locations=true) @test get_sample_rate(p) == 5 - println("Bad get_sample_rate") configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) @test get_sample_rate(p) == 7 @test get_sample_rate() == 7 @@ -108,7 +95,6 @@ end sample(df2) @test has_metadata(p) @test has_sample(p) - @show get_sample_rate(p) configure_sampling(p; always_exact=true) sample(df2) end diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl index 32861306..fff3793a 100644 --- a/BanyanImages/test/jpg.jl +++ b/BanyanImages/test/jpg.jl @@ -85,15 +85,14 @@ end "parallelism encouraged", "parallelism and batches encouraged", ], - (loc, format) in [ - ("Internet", "generator"), - ("S3", "generator"), - ("S3", "directory") - ], - (max_num_bytes, nimages, add_channelview) in [ - (0, 1, false), - (0, 50, true), - (100_000_000_000, 1, true) + (loc, format, max_num_bytes, nimages, add_channelview) in [ + ("Internet", "generator", 0, 1, false), + ("Internet", "generator", 0, 50, true), + ("Internet", "generator", 100_000_000_000, 1, true), + ("S3", "generator", 100_000_000_000, 1, true), + ("S3", "directory", 0, 1, false), + ("S3", "directory", 0, 50, true), + ("S3", "directory", 100_000_000_000, 1, true) ], shuffled in [true, false] @@ -109,14 +108,9 @@ end df = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true, invalidate_samples=true) sample(df) - @show max_num_bytes - @show exact_sample - @show get_sample_rate(p; add_channelview=add_channelview) configure_sampling(p; sample_rate=50) - @show get_sampling_configs() read_jpg(p; add_channelview=add_channelview) - @show get_sampling_configs() @test get_sample_rate(p; add_channelview=add_channelview) == 50 @test has_metadata(p; add_channelview=add_channelview) @test has_sample(p; 
add_channelview=add_channelview) @@ -127,14 +121,8 @@ end @test !has_metadata(p; add_channelview=add_channelview) @test !has_sample(p; add_channelview=add_channelview) - @show get_sample_rate(p; add_channelview=add_channelview) df2 = read_jpg(p; add_channelview=add_channelview) - @show Banyan.LocationPath(p; add_channelview=add_channelview) - @show get_sampling_configs() - @show get_sampling_config(p; add_channelview=add_channelview) - @show get_sample_rate(p; add_channelview=add_channelview) sample(df2) - @show get_sample_rate(p; add_channelview=add_channelview) df2 = read_jpg(p; add_channelview=add_channelview, samples_invalid=true) sample(df2) @test get_sample_rate(p; add_channelview=add_channelview) == 50 @@ -143,11 +131,9 @@ end df2 = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true) sample(df2) @test get_sample_rate(p; add_channelview=add_channelview) == 50 - println("Bad get_sample_rate") @test get_sample_rate() == 75 configure_sampling(sample_rate=75, for_all_locations=true) @test get_sample_rate(p; add_channelview=add_channelview) == 50 - println("Bad get_sample_rate") configure_sampling(sample_rate=75, force_new_sample_rate=true, for_all_locations=true) @test get_sample_rate(p; add_channelview=add_channelview) == 75 @test get_sample_rate() == 75 @@ -158,7 +144,6 @@ end sample(df2) @test has_metadata(p; add_channelview=add_channelview) @test has_sample(p; add_channelview=add_channelview) - @show get_sample_rate(p; add_channelview=add_channelview) configure_sampling(p; add_channelview=add_channelview, always_exact=true) sample(df2) end diff --git a/BanyanImages/test/runtests.jl b/BanyanImages/test/runtests.jl index 58b1eb6b..3d650ff1 100644 --- a/BanyanImages/test/runtests.jl +++ b/BanyanImages/test/runtests.jl @@ -107,4 +107,5 @@ finally # Destroy jobs to clean up. # destroy_all_jobs_for_testing() cleanup_s3_test_files(get_cluster_s3_bucket_name(ENV["BANYAN_CLUSTER_NAME"])) + end_all_sessions_for_testing() end \ No newline at end of file From 28d5894b0971803457669ef423827798572fcb6f Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Fri, 19 Aug 2022 08:28:20 -0400 Subject: [PATCH 23/25] Make start_session create cluster if needed and change default # of workers to 150 --- Banyan/src/clusters.jl | 112 +++++++++++++++++++-------------------- Banyan/src/sessions.jl | 117 +++++++++++++++++++++++++++++++++++------ 2 files changed, 158 insertions(+), 71 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index eabb1215..8ec83cc1 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -1,5 +1,5 @@ struct Cluster - name::String + cluster_name::String status::Symbol status_explanation::String s3_bucket_arn::String @@ -21,7 +21,7 @@ end @nospecialize function create_cluster(; - name::Union{String,Nothing} = nothing, + cluster_name::Union{String,Nothing} = nothing, instance_type::Union{String,Nothing} = "m4.4xlarge", max_num_workers::Union{Int,Nothing} = 2048, initial_num_workers::Union{Int,Nothing} = 16, @@ -42,29 +42,29 @@ function create_cluster(; # Configure using parameters c = configure(; kwargs...) - clusters = get_clusters(name; kwargs...) - if isnothing(name) - name = "Cluster " * string(length(clusters) + 1) + clusters = get_clusters(cluster_name; kwargs...) 
+ if isnothing(cluster_name) + cluster_name = "cluster-" * string(length(clusters) + 1) end if isnothing(region) region = get_aws_config_region() end - # Check if the configuration for this cluster name already exists + # Check if the configuration for this cluster cluster_name already exists # If it does, then recreate cluster - if haskey(clusters, name) - if force_create || clusters[name].status == :terminated - @info "Started re-creating cluster named $name" + if haskey(clusters, cluster_name) + if force_create || clusters[cluster_name].status == :terminated + @info "Started re-creating cluster named $cluster_name" send_request_get_response( :create_cluster, - Dict("cluster_name" => name, "recreate" => true, "force_create" => true), + Dict("cluster_name" => cluster_name, "recreate" => true, "force_create" => true), ) if !nowait - wait_for_cluster(name; kwargs...) + wait_for_cluster(cluster_name; kwargs...) end - return get_cluster(name; kwargs...) + return get_cluster(cluster_name; kwargs...) else - error("Cluster with name $name already exists and its current status is $(string(clusters[name].status))") + error("Cluster with cluster_name $cluster_name already exists and its current status is $(string(clusters[cluster_name].status))") end end @@ -82,7 +82,7 @@ function create_cluster(; # Construct cluster creation cluster_config = Dict{String,Any}( - "cluster_name" => name, + "cluster_name" => cluster_name, "instance_type" => instance_type, "max_num_workers" => max_num_workers, "initial_num_workers" => initial_num_workers, @@ -110,60 +110,60 @@ function create_cluster(; cluster_config["subnet_id"] = subnet_id end - @info "Started creating cluster named $name" + @info "Started creating cluster named $cluster_name" # Send request to create cluster send_request_get_response(:create_cluster, cluster_config) if !nowait - wait_for_cluster(name; kwargs...) + wait_for_cluster(cluster_name; kwargs...) end # Cache info - get_cluster(name; kwargs...) + get_cluster(cluster_name; kwargs...) - return get_clusters_dict()[name] + return get_clusters_dict()[cluster_name] end -function destroy_cluster(name::String; kwargs...) +function destroy_cluster(cluster_name::String; kwargs...) configure(; kwargs...) - @info "Destroying cluster named $name" - send_request_get_response(:destroy_cluster, Dict{String,Any}("cluster_name" => name)) + @info "Destroying cluster named $cluster_name" + send_request_get_response(:destroy_cluster, Dict{String,Any}("cluster_name" => cluster_name)) end -function delete_cluster(name::String; kwargs...) +function delete_cluster(cluster_name::String; kwargs...) configure(; kwargs...) - @info "Deleting cluster named $name" + @info "Deleting cluster named $cluster_name" send_request_get_response( :destroy_cluster, - Dict{String,Any}("cluster_name" => name, "permanently_delete" => true), + Dict{String,Any}("cluster_name" => cluster_name, "permanently_delete" => true), ) end -function update_cluster(name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, nowait=false, kwargs...) +function update_cluster(cluster_name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, nowait=false, kwargs...) configure(; kwargs...) 
- @info "Updating cluster named $name" + @info "Updating cluster named $cluster_name" send_request_get_response( :update_cluster, Dict{String, Any}( - "cluster_name" => name, + "cluster_name" => cluster_name, "force_update" => force_update, "update_linux_packages" => update_linux_packages, "reinstall_julia" => reinstall_julia ) ) if !nowait - wait_for_cluster(name) + wait_for_cluster(cluster_name) end end -function assert_cluster_is_ready(name::String; kwargs...) - @info "Setting status of cluster named $name to running" +function assert_cluster_is_ready(cluster_name::String; kwargs...) + @info "Setting status of cluster named $cluster_name to running" # Configure configure(; kwargs...) - send_request_get_response(:set_cluster_ready, Dict{String,Any}("cluster_name" => name)) + send_request_get_response(:set_cluster_ready, Dict{String,Any}("cluster_name" => cluster_name)) end parsestatus(status::String)::Symbol = @@ -197,9 +197,9 @@ function _get_clusters(cluster_name::String)::Dict{String,Cluster} end response = send_request_get_response(:describe_clusters, Dict{String,Any}("filters"=>filters)) clusters_dict::Dict{String,Cluster} = Dict{String,Cluster}() - for (name::String, c::Dict{String,Any}) in response["clusters"]::Dict{String,Any} - clusters_dict[name] = Cluster( - name, + for (cluster_name::String, c::Dict{String,Any}) in response["clusters"]::Dict{String,Any} + clusters_dict[cluster_name] = Cluster( + cluster_name, parsestatus(c["status"]::String), haskey(c, "status_explanation") ? c["status_explanation"]::String : "", c["s3_read_write_resource"]::String, @@ -212,16 +212,16 @@ function _get_clusters(cluster_name::String)::Dict{String,Cluster} # Cache info curr_clusters_dict = get_clusters_dict() - for (name, c) in clusters_dict - curr_clusters_dict[name] = c + for (cluster_name, c) in clusters_dict + curr_clusters_dict[cluster_name] = c end clusters_dict end -function get_clusters(cluster_name=nothing; kwargs...)::Dict{String,Cluster} +function get_clusters(cluster_name=""; kwargs...)::Dict{String,Cluster} configure(; kwargs...) - _get_clusters(isnothing(cluster_name) ? "" : cluster_name) + _get_clusters(cluster_name) end function get_cluster_s3_bucket_arn(cluster_name=get_cluster_name(); kwargs...) @@ -236,19 +236,19 @@ end get_cluster_s3_bucket_name(cluster_name=get_cluster_name(); kwargs...) = s3_bucket_arn_to_name(get_cluster_s3_bucket_arn(cluster_name; kwargs...)) -get_cluster(name::String=get_cluster_name(); kwargs...)::Cluster = get_clusters(name; kwargs...)[name] +get_cluster(cluster_name::String=get_cluster_name(); kwargs...)::Cluster = get_clusters(cluster_name; kwargs...)[cluster_name] get_running_clusters(args...; kwargs...) 
= filter(entry -> entry[2].status == :running, get_clusters(args...; kwargs...)) -function get_cluster_status(name::String)::Symbol +function get_cluster_status(cluster_name::String)::Symbol clusters_dict = get_clusters_dict() clusters::Dict{String,Cluster} - if haskey(clusters_dict, name) - if clusters_dict[name].status == :failed - @error clusters_dict[name].status_explanation + if haskey(clusters_dict, cluster_name) + if clusters_dict[cluster_name].status == :failed + @error clusters_dict[cluster_name].status_explanation end end - c::Cluster = get_clusters(name)[name] + c::Cluster = get_clusters(cluster_name)[cluster_name] if c.status == :failed @error c.status_explanation end @@ -256,16 +256,16 @@ function get_cluster_status(name::String)::Symbol end get_cluster_status() = get_cluster_status(get_cluster_name()) -function _wait_for_cluster(name::String) +function _wait_for_cluster(cluster_name::String) t::Int64 = 5 - cluster_status::Symbol = get_cluster_status(name) - p::ProgressUnknown = ProgressUnknown("Finding status of cluster $name", enabled=false) + cluster_status::Symbol = get_cluster_status(cluster_name) + p::ProgressUnknown = ProgressUnknown("Finding status of cluster $cluster_name", enabled=false) while (cluster_status == :creating || cluster_status == :updating) if !p.enabled if cluster_status == :creating - p = ProgressUnknown("Setting up cluster $name", spinner=true) + p = ProgressUnknown("Setting up cluster $cluster_name", spinner=true) else - p = ProgressUnknown("Updating cluster $name", spinner=true) + p = ProgressUnknown("Updating cluster $cluster_name", spinner=true) end end sleep(t) @@ -273,28 +273,28 @@ function _wait_for_cluster(name::String) if t < 80 t *= 2 end - cluster_status = get_cluster_status(name) + cluster_status = get_cluster_status(cluster_name) end if p.enabled finish!(p, spinner = (cluster_status == :running ? '✓' : '✗')) end if cluster_status == :running - # @info "Cluster $name is ready" + # @info "Cluster $cluster_name is ready" elseif cluster_status == :terminated - error("Cluster $name no longer exists") + error("Cluster $cluster_name no longer exists") elseif cluster_status != :creating && cluster_status != :updating - error("Failed to set up cluster named $name") + error("Failed to set up cluster named $cluster_name") else - error("Cluster $name has unexpected status: $cluster_status") + error("Cluster $cluster_name has unexpected status: $cluster_status") end end function wait_for_cluster(;kwargs...) configure(;kwargs...) _wait_for_cluster(get_cluster_name()) end -function wait_for_cluster(name::String; kwargs...) +function wait_for_cluster(cluster_name::String; kwargs...) configure(;kwargs...) - _wait_for_cluster(name) + _wait_for_cluster(cluster_name) end function upload_to_s3(src_path; dst_name=basename(src_path), cluster_name=get_cluster_name(), kwargs...) 
diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index dd6cd25d..b9feed7b 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -80,6 +80,7 @@ const NOTHING_STRING = "NOTHING_STRING" function _start_session( cluster_name::String, + c::Cluster, nworkers::Int64, release_resources_after::Integer, print_logs::Bool, @@ -112,17 +113,6 @@ function _start_session( ) global session_sampling_configs - # Construct parameters for starting session - cluster_name = if cluster_name == NOTHING_STRING - running_clusters = get_running_clusters() - if length(running_clusters) == 0 - error("Failed to start session: you don't have any clusters created") - end - first(keys(running_clusters)) - else - cluster_name - end - version = get_julia_version() not_in_modules = m -> !(m in not_using_modules) @@ -151,7 +141,7 @@ function _start_session( if !no_email session_configuration["email_when_ready"] = email_when_ready end - c::Cluster = get_cluster(cluster_name) + s3_bucket_name = s3_bucket_arn_to_name(c.s3_bucket_arn) organization_id = c.organization_id curr_cluster_instance_id = c.curr_cluster_instance_id @@ -292,9 +282,105 @@ function _start_session( session_id end +function start_session_with_cluster( + cluster_name::String, + nworkers::Int64, + release_resources_after::Integer, + print_logs::Bool, + store_logs_in_s3::Bool, + store_logs_on_cluster::Bool, + log_initialization::Bool, + session_name::String, + files::Vector{String}, + code_files::Vector{String}, + force_update_files::Bool, + pf_dispatch_table::Vector{String}, + no_pf_dispatch_table::Bool, + using_modules::Vector{String}, + # We currently can't use modules that require GUI + not_using_modules::Vector{String}, + url::String, + branch::String, + directory::String, + dev_paths::Vector{String}, + force_sync::Bool, + force_pull::Bool, + force_install::Bool, + estimate_available_memory::Bool, + nowait::Bool, + email_when_ready::Bool, + no_email::Bool, + for_running::Bool, + sessions::Dict{String,Session}, + sampling_configs::Dict{LocationPath,SamplingConfig}, + kwargs... +) + # Construct parameters for starting session + cluster_name::String, c::Cluster = if cluster_name == NOTHING_STRING + running_clusters = get_running_clusters() + if isempty(running_clusters) + new_c = create_cluster(; + nowait=false, + initial_num_workers=nworkers, + kwargs... + ) + new_c.cluster_name, new_c + else + first(running_clusters) + end + else + c_dict::Dict{String,Cluster} = get_running_clusters(cluster_name) + cluster_name, if haskey(c_dict, cluster_name) + c_dict[cluster_name] + else + create_cluster(; + cluster_name=cluster_name, + nowait=false, + initial_num_workers=nworkers, + kwargs... 
+ ) + end + end + + _start_session( + cluster_name::String, + c::Cluster, + nworkers::Int64, + release_resources_after::Integer, + print_logs::Bool, + store_logs_in_s3::Bool, + store_logs_on_cluster::Bool, + log_initialization::Bool, + session_name::String, + files::Vector{String}, + code_files::Vector{String}, + force_update_files::Bool, + pf_dispatch_table::Vector{String}, + no_pf_dispatch_table::Bool, + using_modules::Vector{String}, + # We currently can't use modules that require GUI + not_using_modules::Vector{String}, + url::String, + branch::String, + directory::String, + dev_paths::Vector{String}, + force_sync::Bool, + force_pull::Bool, + force_install::Bool, + estimate_available_memory::Bool, + nowait::Bool, + email_when_ready::Bool, + no_email::Bool, + for_running::Bool, + sessions::Dict{String,Session}, + sampling_configs::Dict{LocationPath,SamplingConfig} + ) +end + function start_session(; cluster_name::String = NOTHING_STRING, - nworkers::Int64 = 16, + # Default 100x speedup + nworkers::Int64 = 150, release_resources_after::Union{Integer,Nothing} = 20, print_logs::Bool = false, store_logs_in_s3::Bool = true, @@ -334,7 +420,7 @@ function start_session(; configure(; kwargs...) configure_sampling(; kwargs...) - current_session_id = _start_session( + current_session_id = start_session_with_cluster( cluster_name, nworkers, isnothing(release_resources_after) ? -1 : release_resources_after, @@ -364,7 +450,8 @@ function start_session(; isnothing(email_when_ready), for_running, sessions, - get_sampling_configs() + get_sampling_configs(), + kwargs... ) current_session_id end From a99563855b4bd8586b6ea69911dcddcf83f8d53f Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Fri, 19 Aug 2022 12:39:16 -0400 Subject: [PATCH 24/25] Add automatic destruction of idle clusters --- Banyan/src/clusters.jl | 4 +++- Banyan/src/sessions.jl | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 8ec83cc1..1d9ee6d9 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -36,6 +36,7 @@ function create_cluster(; subnet_id = nothing, nowait=false, force_create=false, + destroy_cluster_after = -1, kwargs..., ) @@ -95,7 +96,8 @@ function create_cluster(; # by size of 1 GiB and then round up. Then the backend will determine how to adjust the # disk capacity to an allowable increment (e.g., 1200 GiB or an increment of 2400 GiB # for AWS FSx Lustre filesystems) - "disk_capacity" => disk_capacity == "auto" ? -1 : ceil(Int64, parse_bytes(disk_capacity) / 1.073741824e7) + "disk_capacity" => disk_capacity == "auto" ? -1 : ceil(Int64, parse_bytes(disk_capacity) / 1.073741824e7), + "destroy_cluster_after" => destroy_cluster_after ) if haskey(c["aws"], "ec2_key_pair_name") cluster_config["ec2_key_pair"] = c["aws"]["ec2_key_pair_name"] diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index b9feed7b..5a35c4c2 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -319,9 +319,12 @@ function start_session_with_cluster( cluster_name::String, c::Cluster = if cluster_name == NOTHING_STRING running_clusters = get_running_clusters() if isempty(running_clusters) + # If the user is not separately creating a cluster, we should + # by default destroy it after 12 hours. new_c = create_cluster(; nowait=false, initial_num_workers=nworkers, + destroy_after=(12 * 60) kwargs... 
) new_c.cluster_name, new_c @@ -337,6 +340,7 @@ function start_session_with_cluster( cluster_name=cluster_name, nowait=false, initial_num_workers=nworkers, + destroy_after=(12 * 60), kwargs... ) end From 045f2e01ab1a93268b84484c705fe800fef97dff Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Mon, 22 Aug 2022 06:15:39 -0700 Subject: [PATCH 25/25] Implement all changes to make starting sessions lazy --- Banyan/src/clusters.jl | 54 ++- Banyan/src/requests.jl | 7 +- Banyan/src/samples.jl | 3 +- Banyan/src/sessions.jl | 364 ++++++++++------ Banyan/test/Project.toml | 1 - Banyan/test/clusters.jl | 6 +- Banyan/test/run_session_test_script.jl | 1 + Banyan/test/runtests.jl | 2 - Banyan/test/sessions.jl | 581 ++++++++++++++----------- BanyanArrays/test/Project.toml | 2 +- BanyanHDF5/test/Project.toml | 4 +- BanyanHDF5/test/runtests.jl | 2 + BanyanImages/test/Project.toml | 2 +- 13 files changed, 608 insertions(+), 421 deletions(-) create mode 100644 Banyan/test/run_session_test_script.jl diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 1d9ee6d9..c74cc0b0 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -34,9 +34,10 @@ function create_cluster(; region = nothing, vpc_id = nothing, subnet_id = nothing, - nowait=false, + wait_now=true, force_create=false, destroy_cluster_after = -1, + show_progress = true, kwargs..., ) @@ -55,17 +56,24 @@ function create_cluster(; # If it does, then recreate cluster if haskey(clusters, cluster_name) if force_create || clusters[cluster_name].status == :terminated - @info "Started re-creating cluster named $cluster_name" + if show_progress + @info "Started re-creating cluster named $cluster_name" + end send_request_get_response( :create_cluster, Dict("cluster_name" => cluster_name, "recreate" => true, "force_create" => true), ) - if !nowait + if wait_now + wait_for_cluster(cluster_name; kwargs...) + end + return get_cluster(cluster_name; kwargs...) + elseif clusters[cluster_name].status == :creating + if wait_now wait_for_cluster(cluster_name; kwargs...) end return get_cluster(cluster_name; kwargs...) else - error("Cluster with cluster_name $cluster_name already exists and its current status is $(string(clusters[cluster_name].status))") + error("Cluster with name $cluster_name already exists and its current status is $(string(clusters[cluster_name].status))") end end @@ -112,12 +120,14 @@ function create_cluster(; cluster_config["subnet_id"] = subnet_id end - @info "Started creating cluster named $cluster_name" + if show_progress + @info "Started creating cluster named $cluster_name" + end # Send request to create cluster send_request_get_response(:create_cluster, cluster_config) - if !nowait + if wait_now wait_for_cluster(cluster_name; kwargs...) end @@ -131,6 +141,7 @@ function destroy_cluster(cluster_name::String; kwargs...) configure(; kwargs...) @info "Destroying cluster named $cluster_name" send_request_get_response(:destroy_cluster, Dict{String,Any}("cluster_name" => cluster_name)) + ; end function delete_cluster(cluster_name::String; kwargs...) @@ -140,9 +151,10 @@ function delete_cluster(cluster_name::String; kwargs...) :destroy_cluster, Dict{String,Any}("cluster_name" => cluster_name, "permanently_delete" => true), ) + ; end -function update_cluster(cluster_name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, nowait=false, kwargs...) +function update_cluster(cluster_name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, wait_now=true, kwargs...) 
configure(; kwargs...) @info "Updating cluster named $cluster_name" send_request_get_response( @@ -154,9 +166,10 @@ function update_cluster(cluster_name::String; force_update=false, update_linux_p "reinstall_julia" => reinstall_julia ) ) - if !nowait + if wait_now wait_for_cluster(cluster_name) end + ; end function assert_cluster_is_ready(cluster_name::String; kwargs...) @@ -166,6 +179,7 @@ function assert_cluster_is_ready(cluster_name::String; kwargs...) configure(; kwargs...) send_request_get_response(:set_cluster_ready, Dict{String,Any}("cluster_name" => cluster_name)) + ; end parsestatus(status::String)::Symbol = @@ -258,26 +272,28 @@ function get_cluster_status(cluster_name::String)::Symbol end get_cluster_status() = get_cluster_status(get_cluster_name()) -function _wait_for_cluster(cluster_name::String) +function _wait_for_cluster(cluster_name::String, show_progress::Bool) t::Int64 = 5 cluster_status::Symbol = get_cluster_status(cluster_name) - p::ProgressUnknown = ProgressUnknown("Finding status of cluster $cluster_name", enabled=false) + p::ProgressUnknown = ProgressUnknown("Finding status of cluster $cluster_name", enabled=show_progress) while (cluster_status == :creating || cluster_status == :updating) - if !p.enabled + if show_progress && !p.enabled if cluster_status == :creating - p = ProgressUnknown("Setting up cluster $cluster_name", spinner=true) + p = ProgressUnknown("Setting up cluster $cluster_name", spinner=true, enabled=show_progress) else - p = ProgressUnknown("Updating cluster $cluster_name", spinner=true) + p = ProgressUnknown("Updating cluster $cluster_name", spinner=true, enabled=show_progress) end end sleep(t) - next!(p) + if show_progress + next!(p) + end if t < 80 t *= 2 end cluster_status = get_cluster_status(cluster_name) end - if p.enabled + if show_progress finish!(p, spinner = (cluster_status == :running ? '✓' : '✗')) end if cluster_status == :running @@ -290,13 +306,13 @@ function _wait_for_cluster(cluster_name::String) error("Cluster $cluster_name has unexpected status: $cluster_status") end end -function wait_for_cluster(;kwargs...) +function wait_for_cluster(show_progress=true; kwargs...) configure(;kwargs...) - _wait_for_cluster(get_cluster_name()) + _wait_for_cluster(get_cluster_name(), show_progress) end -function wait_for_cluster(cluster_name::String; kwargs...) +function wait_for_cluster(cluster_name::String, show_progress=true; kwargs...) configure(;kwargs...) - _wait_for_cluster(cluster_name) + _wait_for_cluster(cluster_name, show_progress) end function upload_to_s3(src_path; dst_name=basename(src_path), cluster_name=get_cluster_name(), kwargs...) diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 8578786d..36218d52 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -236,7 +236,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n # There are two cases: either we # TODO: Maybe we don't need to wait_For_session - # There is a problem where we start a session with nowait=true and then it + # There is a problem where we start a session with wait_now=false and then it # reuses a resource that is in a creating state. Since the session is still # creating and we have not yet waited for it to start, if we have # `estimate_available_memory=false` then we will end up with job info not @@ -524,7 +524,7 @@ end function send_evaluation(value_id::ValueId, session_id::SessionId) # First we ensure that the session is ready. 
This way, we can get a good # estimate of available worker memory before calling evaluate. - wait_for_session(session_id) + session = get_session(session_id) encourage_parallelism = get_encourage_parallelism() encourage_parallelism_with_batches = get_encourage_parallelism_with_batches() @@ -690,12 +690,11 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) # We must wait for session because otherwise we will slurp up the session # ready message on the gather queue. - wait_for_session(session_id) + session = get_session(session_id) # job_id = Banyan.get_job_id() p = ProgressUnknown("Running offloaded code", spinner=true) - session = get_session() gather_queue = gather_queue_url() stored_res = nothing error_for_main_stuck, error_for_main_stuck_time = nothing, nothing diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 2ce34517..66df1558 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -1,5 +1,6 @@ function configure_sampling( path=""; + nworkers=nothing, sample_rate=nothing, always_exact=nothing, max_num_bytes_exact=nothing, @@ -13,7 +14,7 @@ function configure_sampling( sc = default ? DEFAULT_SAMPLING_CONFIG : get_sampling_config(path; kwargs...) nsc = SamplingConfig( - (!isnothing(sample_rate)) ? sample_rate : sc.rate, + (!isnothing(sample_rate)) ? sample_rate : (!isnothing(nworkers) ? (nworkers * 8) : sc.rate), (!isnothing(always_exact)) ? always_exact : sc.always_exact, (!isnothing(max_num_bytes_exact)) ? max_num_bytes_exact : sc.max_num_bytes_exact, (!isnothing(force_new_sample_rate)) ? force_new_sample_rate : sc.force_new_sample_rate, diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index 5a35c4c2..d92a63ec 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -16,6 +16,9 @@ global sessions = Dict{SessionId,Session}() # ergonomic. global current_session_id = "" +# Tasks for starting sessions +global start_session_tasks = Dict{SessionId,Task}() + function set_session(session_id::SessionId) global current_session_id current_session_id = session_id @@ -23,17 +26,49 @@ end function _get_session_id_no_error()::SessionId global current_session_id - current_session_id + global sessions + !haskey(sessions, current_session_id) ? "" : current_session_id end -function get_session_id()::SessionId +function get_session_id(session_id="")::SessionId global current_session_id - if isempty(current_session_id) - error( - "No session started or selected using `start_session` or `with_session` or `set_session`. 
The current session may have been destroyed or no session started yet.", - ) + global sessions + global start_session_tasks + global session_sampling_configs + + if isempty(session_id) + session_id = current_session_id + end + + if haskey(sessions, session_id) + session_id + elseif haskey(start_session_tasks, session_id) + start_session_task = start_session_tasks[session_id] + if istaskdone(start_session_task) && length(start_session_task.result) == 2 + e, bt = start_session_task.result + showerror(stderr, e, bt) + error("Failed to start session with ID $session_id") + session_id + elseif istaskdone(start_session_task) && length(start_session_task.result) == 3 + new_session_id, session, sampling_configs = start_session_task.result + sessions[new_session_id] = session + session_sampling_configs[new_session_id] = sampling_configs + if session_id == current_session_id + current_session_id = new_session_id + end + new_session_id + else + # Otherwise, the task is still running or hasn't yet been started + # in which case we will just return the ID of the start_session task + session_id + end + elseif isempty(session_id) + start_session() + elseif startswith(session_id, "start-session-") + error("The session with ID $session_id was not created in this Julia session") + else + session_id end - current_session_id end function get_sessions_dict()::Dict{SessionId,Session} @@ -41,19 +76,46 @@ function get_sessions_dict()::Dict{SessionId,Session} sessions end -function get_session()::Session - session_id = get_session_id() +function get_session(session_id=get_session_id(), show_progress=true)::Session sessions_dict = get_sessions_dict() - if !haskey(sessions_dict, session_id) - error("The selected session does not have any information; if it was started by this process, it has either failed or been destroyed.") + global start_session_tasks + if haskey(sessions_dict, session_id) + sessions_dict[session_id] + elseif haskey(start_session_tasks, session_id) + # Schedule the task if not yet scheduled + start_session_task = start_session_tasks[session_id] + if !istaskstarted(start_session_task) + yield(start_session_task) + end + + # Keep looping till the task is created + + p = ProgressUnknown("Preparing session with ID $session_id", spinner=true, enabled=show_progress) + try + while !haskey(get_sessions_dict(), get_session_id(session_id)) + if p.enabled + next!(p) + end + end + catch e + if p.enabled + finish!(p, spinner = '✗') + end + rethrow() + end + if p.enabled + finish!(p, spinner = '✓') + end + get_sessions_dict()[get_session_id(session_id)] + else + error("The current session ID $session_id is not stored as a session starting task in progress or a running session") end - sessions_dict[session_id] end get_cluster_name()::String = get_session().cluster_name function get_loaded_packages() - global current_session_id + current_session_id = _get_session_id_no_error() loaded_packages::Set{String} = if !isempty(current_session_id) get_sessions_dict()[current_session_id].loaded_packages else @@ -78,6 +140,8 @@ end const NOTHING_STRING = "NOTHING_STRING" +const StartSessionResult = Tuple{SessionId,Session,Dict{LocationPath,SamplingConfig}} + function _start_session( cluster_name::String, c::Cluster, @@ -104,13 +168,12 @@ function _start_session( force_pull::Bool, force_install::Bool, estimate_available_memory::Bool, - nowait::Bool, email_when_ready::Bool, no_email::Bool, for_running::Bool, sessions::Dict{String,Session}, sampling_configs::Dict{LocationPath,SamplingConfig} -) +)::StartSessionResult global 
session_sampling_configs version = get_julia_version() @@ -254,9 +317,9 @@ function _start_session( end message end - @info msg + # @info msg # Store in global state - sessions[session_id] = Session( + new_session = Session( cluster_name, session_id, resource_id, @@ -270,16 +333,15 @@ function _start_session( gather_queue_url=gather_queue_url, execution_queue_url=execution_queue_url ) - session_sampling_configs[session_id] = sampling_configs - if !nowait - wait_for_session(session_id) - elseif !reusing_resources - @warn "Starting this session requires creating new cloud computing resources which will take 10-30 minutes for the first computation." - end + # if !nowait + wait_for_session(session_id, false) + # elseif !reusing_resources + # @warn "Starting this session requires creating new cloud computing resources which will take 10-30 minutes for the first computation." + # end @debug "Finished call to start_session with ID $session_id" - session_id + session_id, new_session, sampling_configs end function start_session_with_cluster( @@ -307,14 +369,13 @@ function start_session_with_cluster( force_pull::Bool, force_install::Bool, estimate_available_memory::Bool, - nowait::Bool, email_when_ready::Bool, no_email::Bool, for_running::Bool, sessions::Dict{String,Session}, sampling_configs::Dict{LocationPath,SamplingConfig}, kwargs... -) +)::StartSessionResult # Construct parameters for starting session cluster_name::String, c::Cluster = if cluster_name == NOTHING_STRING running_clusters = get_running_clusters() @@ -322,9 +383,10 @@ function start_session_with_cluster( # If the user is not separately creating a cluster, we should # by default destroy it after 12 hours. new_c = create_cluster(; - nowait=false, + wait_now=true, initial_num_workers=nworkers, - destroy_after=(12 * 60) + destroy_after=(12 * 60), + show_progress=false, kwargs... ) new_c.cluster_name, new_c @@ -338,9 +400,10 @@ function start_session_with_cluster( else create_cluster(; cluster_name=cluster_name, - nowait=false, + wait_now=true, initial_num_workers=nworkers, destroy_after=(12 * 60), + show_progress=false, kwargs... ) end @@ -372,7 +435,6 @@ function start_session_with_cluster( force_pull::Bool, force_install::Bool, estimate_available_memory::Bool, - nowait::Bool, email_when_ready::Bool, no_email::Bool, for_running::Bool, @@ -384,7 +446,7 @@ end function start_session(; cluster_name::String = NOTHING_STRING, # Default 100x speedup - nworkers::Int64 = 150, + nworkers::Int64 = -1, release_resources_after::Union{Integer,Nothing} = 20, print_logs::Bool = false, store_logs_in_s3::Bool = true, @@ -406,9 +468,10 @@ function start_session(; force_pull::Bool = false, force_install::Bool = false, estimate_available_memory::Bool = true, - nowait::Bool = true, email_when_ready::Union{Bool,Nothing} = nothing, for_running::Bool = false, + start_now::Bool = false, + wait_now::Bool = false, kwargs..., )::SessionId # Should save 5ms of overhead @@ -418,60 +481,91 @@ function start_session(; global BANYAN_JULIA_PACKAGES sessions = get_sessions_dict() - global current_session_id + global start_session_tasks # Configure configure(; kwargs...) - configure_sampling(; kwargs...) + nworkers = nworkers == -1 ? (is_debug_on() ? 2 : 150) : nworkers + configure_sampling(; nworkers=nworkers, kwargs...) - current_session_id = start_session_with_cluster( - cluster_name, - nworkers, - isnothing(release_resources_after) ? 
-1 : release_resources_after,
-        print_logs,
-        store_logs_in_s3,
-        store_logs_on_cluster,
-        log_initialization,
-        session_name,
-        files,
-        code_files,
-        force_update_files,
-        isnothing(pf_dispatch_table) ? String[] : pf_dispatch_table,
-        isnothing(pf_dispatch_table),
-        using_modules,
-        # We currently can't use modules that require GUI
-        not_using_modules,
-        url,
-        branch,
-        directory,
-        dev_paths,
-        force_sync,
-        force_pull,
-        force_install,
-        estimate_available_memory,
-        nowait,
-        isnothing(email_when_ready) ? false : email_when_ready,
-        isnothing(email_when_ready),
-        for_running,
-        sessions,
-        get_sampling_configs(),
-        kwargs...
-    )
-    current_session_id
+    # Create task for starting session
+    new_start_session_task_id = "start-session-$(length(start_session_tasks) + 1)"
+    new_start_session_task =
+        Task(
+            () -> try
+                start_session_with_cluster(
+                    cluster_name,
+                    nworkers,
+                    isnothing(release_resources_after) ? -1 : release_resources_after,
+                    print_logs,
+                    store_logs_in_s3,
+                    store_logs_on_cluster,
+                    log_initialization,
+                    session_name,
+                    files,
+                    code_files,
+                    force_update_files,
+                    isnothing(pf_dispatch_table) ? String[] : pf_dispatch_table,
+                    isnothing(pf_dispatch_table),
+                    using_modules,
+                    # We currently can't use modules that require GUI
+                    not_using_modules,
+                    url,
+                    branch,
+                    directory,
+                    dev_paths,
+                    force_sync,
+                    force_pull,
+                    force_install,
+                    estimate_available_memory,
+                    isnothing(email_when_ready) ? false : email_when_ready,
+                    isnothing(email_when_ready),
+                    for_running,
+                    sessions,
+                    get_sampling_configs(),
+                    kwargs...
+                )
+            catch e
+                bt = catch_backtrace()
+                (e, bt)
+            end
+        )
+    start_session_tasks[new_start_session_task_id] = new_start_session_task
+    set_session(new_start_session_task_id)
+
+    # Start now or wait now if requested
+    if start_now || wait_now
+        yield(new_start_session_task)
+    end
+    if wait_now
+        get_session(new_start_session_task_id)
+    end
+
+    # Return the current session ID
+    get_session_id()
 end
 
-function end_session(session_id::SessionId = get_session_id(); failed = false, release_resources_now = false, release_resources_after = nothing, kwargs...)
+function end_session(session_id::SessionId = get_session_id(); failed = false, release_resources_now = false, release_resources_after = nothing, destroy_cluster=false, kwargs...)
     sessions = get_sessions_dict()
     global current_session_id
+    global start_session_tasks
 
     # Configure using parameters
     configure(; kwargs...)
 
+    # Ensure that the session ID is not of a creating task
+    # TODO: Get the session ID before the task begins wait_for_session
+    # so that it can be ended sooner. (maybe use local storage of the task)
+    if haskey(start_session_tasks, session_id)
+        @warn "Session with ID $session_id is still being created; waiting for it to start before ending it"
+        session_id = get_session(session_id).id
+    end
+
     request_params = Dict{String,Any}("session_id" => session_id, "failed" => failed, "release_resources_now" => release_resources_now)
     if !isnothing(release_resources_after)
         request_params["release_resources_after"] = release_resources_after
     end
-    send_request_get_response(
+    resp = send_request_get_response(
         :end_session,
         request_params,
     )
@@ -480,6 +574,16 @@ function end_session(session_id::SessionId = get_session_id(); failed = false, r
     # Remove from global state
     set_session("")
     delete!(sessions, session_id)
+
+    # Destroy cluster if desired; qualify with the module so the `destroy_cluster` keyword argument does not shadow the function
+    if destroy_cluster
+        if isnothing(resp) || !haskey(resp, "cluster_name")
+            @warn "Unable to destroy cluster for session with ID $session_id"
+        else
+            Banyan.destroy_cluster(resp["cluster_name"])
+        end
+    end
+
     session_id
 end
 
@@ -573,6 +677,7 @@ get_running_sessions(args...; kwargs...) = get_sessions(args...; status="running
 function download_session_logs(session_id::SessionId, cluster_name::String, filename::Union{String,Nothing}=nothing; kwargs...)
     @debug "Downloading logs for session"
     configure(; kwargs...)
+    session_id = get_session_id(session_id)
     s3_bucket_name = get_cluster_s3_bucket_name(cluster_name; kwargs...)
     log_file_name = "banyan-log-for-session-$(session_id)"
     if isnothing(filename) & !isdir(joinpath(homedir(), ".banyan", "logs"))
@@ -584,7 +689,9 @@ function download_session_logs(session_id::SessionId, cluster_name::String, file
     return filename
 end
 
-function print_session_logs(session_id, cluster_name, delete_file=true)
+function print_session_logs(session_id, cluster_name, delete_file=true; kwargs...)
+    configure(; kwargs...)
+    session_id = get_session_id(session_id)
     s3_bucket_name = get_cluster_s3_bucket_name(cluster_name)
     log_file_name = "banyan-log-for-session-$(session_id)"
     logs = s3_get(global_aws_config(), s3_bucket_name, log_file_name)
@@ -614,8 +721,12 @@ function end_all_sessions(cluster_name::String; release_resources_now = false, r
     end
 end
 
-function get_session_status(session_id::String=get_session_id(); kwargs...)::String
+function get_session_status(session_id::String=_get_session_id_no_error(); kwargs...)::String
+    global start_session_tasks
     sessions = get_sessions_dict()
+    if !haskey(sessions, session_id) && haskey(start_session_tasks, session_id) && !istaskdone(start_session_tasks[session_id])
+        return "creating"
+    end
     configure(; kwargs...)
     filters = Dict{String,Any}("session_id" => session_id)
     params = Dict{String,Any}("filters"=>filters)
@@ -640,10 +751,10 @@ function get_session_status(session_id::String=get_session_id(); kwargs...)::Str
     session_status
 end
 
-function _wait_for_session(session_id::SessionId=get_session_id(); kwargs...)
+function _wait_for_session(session_id::SessionId, show_progress; kwargs...)
     sessions_dict = get_sessions_dict()
     session_status = get_session_status(session_id; kwargs...)
-    p = ProgressUnknown("Preparing session with ID $session_id", spinner=true)
+    p = ProgressUnknown("Preparing session with ID $session_id", spinner=true, enabled=show_progress)
     t = 0
     st = time()
     while session_status == "creating"
@@ -653,10 +764,14 @@ function _wait_for_session(session_id::SessionId=get_session_id(); kwargs...)
         else
             7
         end
-        next!(p)
+        if p.enabled
+            next!(p)
+        end
         session_status = get_session_status(session_id; kwargs...)
     end
-    finish!(p, spinner = session_status == "running" ? '✓' : '✗')
+    if p.enabled
+        finish!(p, spinner = session_status == "running" ? '✓' : '✗')
+    end
     if session_status == "running"
         @debug "Session with ID $session_id is ready"
         if haskey(sessions_dict, session_id)
@@ -671,26 +786,33 @@ function _wait_for_session(session_id::SessionId=get_session_id(); kwargs...)
     end
 end
 
-function wait_for_session(session_id::SessionId=get_session_id(); kwargs...)
+function wait_for_session(session_id::SessionId=get_session_id(), show_progress=true; kwargs...)
+    global start_session_tasks
     sessions_dict = get_sessions_dict()
-    is_session_ready = if haskey(sessions_dict, session_id)
-        session_info::Session = sessions_dict[session_id]
-        if !session_info.is_cluster_ready
-            wait_for_cluster(session_info.cluster_name, kwargs...)
-        end
-        session_info.is_session_ready
+
+    if haskey(start_session_tasks, session_id)
+        get_session(session_id, show_progress)
     else
-        false
-    end
-    if !is_session_ready
-        _wait_for_session(session_id; kwargs...)
+        is_session_ready = if haskey(sessions_dict, session_id)
+            session_info::Session = sessions_dict[session_id]
+            if !session_info.is_cluster_ready
+                wait_for_cluster(session_info.cluster_name, show_progress; kwargs...)
+            end
+            session_info.is_session_ready
+        else
+            false
+        end
+        if !is_session_ready
+            _wait_for_session(session_id, show_progress; kwargs...)
+        end
     end
+    ;
 end
 
 function with_session(f::Function; kwargs...)
     # This is not a constructor; this is just a function that ensures that
     # every session is always destroyed even in the case of an error
-    use_existing_session = :session in keys(kwargs)
+    use_existing_session = haskey(kwargs, :session)
     end_session_on_error = get(kwargs, :end_session_on_error, true)::Bool
     end_session_on_exit = get(kwargs, :end_session_on_exit, true)::Bool
     j = use_existing_session ? kwargs[:session] : start_session(; kwargs...)
@@ -716,71 +838,49 @@ function with_session(f::Function; kwargs...)
         end
     end
-
-function run_session(;
-    cluster_name::String = NOTHING_STRING,
-    nworkers::Int64 = 16,
-    release_resources_after::Union{Integer,Nothing} = 20,
+function run_session(code_files::Union{String,Vector{String}};
     print_logs::Bool = false,
     store_logs_in_s3::Bool = true,
-    store_logs_on_cluster::Bool = false,
-    sample_rate::Int64 = nworkers,
-    session_name::String = NOTHING_STRING,
-    files::Vector{String} = String[],
-    code_files::Vector{String} = String[],
-    force_update_files::Bool = true,
-    pf_dispatch_table::Union{Vector{String},Nothing} = nothing,
-    using_modules::Vector{String} = String[],
-    url::String = NOTHING_STRING,
-    branch::String = NOTHING_STRING,
-    directory::String = NOTHING_STRING,
-    dev_paths::Vector{String} = String[],
-    force_sync::Bool = false,
-    force_pull::Bool = false,
-    force_install::Bool = false,
-    estimate_available_memory::Bool = true,
-    email_when_ready::Union{Bool,Nothing}=nothing,
     kwargs...,)::SessionId
-    force_update_files = true
     store_logs_in_s3_orig = store_logs_in_s3
+    cluster_name = ""
     try
         if print_logs
            # If logs need to be printed, ensure that we save logs in S3.
If # store_logs_in_s3==False, then delete logs in S3 later store_logs_in_s3 = true end - start_session(;cluster_name = cluster_name, nworkers = nworkers, release_resources_after = release_resources_after, - print_logs = print_logs, store_logs_in_s3 = store_logs_in_s3, store_logs_on_cluster = store_logs_on_cluster, - sample_rate = sample_rate, session_name = session_name, files = files, code_files = code_files, force_update_files = force_update_files, - pf_dispatch_table = pf_dispatch_table, using_modules = using_modules, url = url, branch = branch, - directory = directory, dev_paths = dev_paths, force_sync = force_sync, force_pull = force_pull, force_install = force_install, - estimate_available_memory = estimate_available_memory, nowait = false, email_when_ready = email_when_ready, for_running = true) + s = start_session(; + print_logs = print_logs, + store_logs_in_s3 = store_logs_in_s3, + wait_now = true, + for_running = true, + force_update_files = true, + code_files = code_files isa String ? String[code_files] : code_files, + kwargs... + ) + cluster_name = get_session().cluster_name + s catch - session_id = try - get_session_id() - catch - nothing - end - if !isnothing(session_id) + session_id = _get_session_id_no_error() + if !isempty(session_id) end_session(session_id, failed=true, release_resources_now=true) - if print_logs + if print_logs && !isempty(cluster_name) print_session_logs(session_id, cluster_name, !store_logs_in_s3_orig) end end rethrow() + session_id finally - session_id = try - get_session_id() - catch - nothing - end - if !isnothing(session_id) + session_id = _get_session_id_no_error() + if !isempty(session_id) end_session(session_id, failed=false, release_resources_now=true) - if print_logs + if print_logs && !isempty(cluster_name) print_session_logs(session_id, cluster_name, !store_logs_in_s3_orig) end - end + end + session_id end end diff --git a/Banyan/test/Project.toml b/Banyan/test/Project.toml index 61c14273..7fd73758 100644 --- a/Banyan/test/Project.toml +++ b/Banyan/test/Project.toml @@ -3,7 +3,6 @@ AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" -Banyan = "706d138b-e922-45b9-a636-baf8ae0d5317" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index 61af636e..bea02fc4 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -63,7 +63,7 @@ end name=cluster_name, instance_type="t3.large", s3_bucket_name=s3_bucket, - nowait=true + wait_now=false ) sleep(30) # Just to ensure that cluster creation has initiated s3_bucket_name = get_cluster_s3_bucket_name(cluster_name) @@ -85,7 +85,7 @@ end end c_r = create_cluster( name=cluster_name, - nowait=true + wait_now=false ) s3_bucket_name_r = get_cluster_s3_bucket_name(cluster_name) s3_bucket_exists = bucket_exists(s3_bucket_name_r) @@ -99,7 +99,7 @@ end @test !s3_bucket_exists # Check that the cluster cannot be created again - @test_throws ErrorException create_cluster(name=cluster_name, nowait=true) + @test_throws ErrorException create_cluster(name=cluster_name, wait_now=false) end @testset "Benchmark create_cluster with $instance_type instance type" for instance_type in [ diff --git a/Banyan/test/run_session_test_script.jl b/Banyan/test/run_session_test_script.jl new file mode 100644 index 00000000..0b817e83 --- 
/dev/null +++ b/Banyan/test/run_session_test_script.jl @@ -0,0 +1 @@ +@show get_worker_idx() \ No newline at end of file diff --git a/Banyan/test/runtests.jl b/Banyan/test/runtests.jl index 36ea9c3b..81139ec2 100644 --- a/Banyan/test/runtests.jl +++ b/Banyan/test/runtests.jl @@ -16,8 +16,6 @@ end function use_session_for_testing( f::Function; nworkers = parse(Int64, get(ENV, "BANYAN_NWORKERS", "2")), - sample_rate = 2, - nworkers = 2, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( diff --git a/Banyan/test/sessions.jl b/Banyan/test/sessions.jl index 738f9837..55d13b97 100644 --- a/Banyan/test/sessions.jl +++ b/Banyan/test/sessions.jl @@ -1,282 +1,353 @@ -# Tests for Sessions: -# Start a session that creates a new job -# Start a session that reuses a job -# Previous session was successfully ended (by calling end_session with delayed destruction) -# Previous session had a session failure - -@testset "Get sessions with status $status" for status in [ - "all", - "creating", - "running", - "failed", - "completed", - "invalid_status" -] - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - - if status == "all" - sessions = get_sessions(cluster_name) - else - filtered_sessions = get_sessions(cluster_name, status=status) - @test all(s -> s[2]["status"] == status, filtered_sessions) - end -end - -@testset "Get running sessions" begin - # Start a session - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - - session_id = start_session(cluster_name=cluster_name, nworkers=2) - running_sessions = get_running_sessions(cluster_name) - end_session(session_id, release_resources_now=true) - sessions = get_sessions(cluster_name) - - @test all(s -> s[2]["status"] == "running", running_sessions) - @test any(s -> s[1] == session_id, running_sessions) - @test any(s -> (s[1] == session_id && s[2]["status"] == "completed"), sessions) -end - -# Test that starting a second session after one has been ended -# reuses the same job, if the parameters match. 
-@testset "Start and end multiple sessions" begin - # Pkg.activate("envs/DataAnalysisProject/") - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - delay_time = 5 +# # Tests for Sessions: +# # Start a session that creates a new job +# # Start a session that reuses a job +# # Previous session was successfully ended (by calling end_session with delayed destruction) +# # Previous session had a session failure + +# @testset "Get sessions with status $status" for status in [ +# "all", +# "creating", +# "running", +# "failed", +# "completed", +# "invalid_status" +# ] +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] - # Start a session and end it - session_id_1 = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - force_synce = true, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - release_resources_after=delay_time - ) - resource_id_1 = get_session().resource_id - session_status = get_session_status(session_id_1) - @test session_status == "running" +# if status == "all" +# sessions = get_sessions(cluster_name) +# else +# filtered_sessions = get_sessions(cluster_name, status=status) +# @test all(s -> s[2]["status"] == status, filtered_sessions) +# end +# end - end_session(session_id_1) - sleep(60) # To ensure session gets ended - session_status = get_session_status(session_id_1) - @test session_status == "completed" +# @testset "Get running sessions" begin +# # Start a session +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] + +# session_id = start_session(cluster_name=cluster_name, nworkers=2) +# running_sessions = get_running_sessions(cluster_name) +# end_session(session_id, release_resources_now=true) +# sessions = get_sessions(cluster_name) + +# @test all(s -> s[2]["status"] == "running", running_sessions) +# @test any(s -> s[1] == session_id, running_sessions) +# @test any(s -> (s[1] == session_id && s[2]["status"] == "completed"), sessions) +# end - # Start another session with same nworkers and verify the job ID matches - session_id_2 = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - release_resources_after=delay_time - ) - resource_id_2 = get_session().resource_id - session_status = get_session_status(session_id_2) - @test session_status == "running" - @test resource_id_2 == resource_id_1 # it should have reused resource +# # Test that starting a second session after one has been ended +# # reuses the same job, if the parameters match. 
+# @testset "Start and end multiple sessions" begin +# # Pkg.activate("envs/DataAnalysisProject/") +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] +# delay_time = 5 + +# # Start a session and end it +# session_id_1 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# force_synce = true, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# release_resources_after=delay_time +# ) +# resource_id_1 = get_session().resource_id +# session_status = get_session_status(session_id_1) +# @test session_status == "running" + +# end_session(session_id_1) +# sleep(60) # To ensure session gets ended +# session_status = get_session_status(session_id_1) +# @test session_status == "completed" + +# # Start another session with same nworkers and verify the job ID matches +# session_id_2 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# release_resources_after=delay_time +# ) +# resource_id_2 = get_session().resource_id +# session_status = get_session_status(session_id_2) +# @test session_status == "running" +# @test resource_id_2 == resource_id_1 # it should have reused resource - end_session(session_id_2) - sleep(60) - session_status = get_session_status(session_id_2) - @test session_status == "completed" - - # Start another session with different nworkers and verify the job ID - # is different - session_id_3 = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 4, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - release_resources_after=delay_time - ) - resource_id_3 = get_session().resource_id - session_status = get_session_status(session_id_3) - @test session_status == "running" - @test resource_id_3 != resource_id_1 +# end_session(session_id_2) +# sleep(60) +# session_status = get_session_status(session_id_2) +# @test session_status == "completed" + +# # Start another session with different nworkers and verify the job ID +# # is different +# session_id_3 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 4, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# release_resources_after=delay_time +# ) +# resource_id_3 = get_session().resource_id +# session_status = get_session_status(session_id_3) +# @test session_status == "running" +# @test resource_id_3 != resource_id_1 - end_session(session_id_3) - sleep(60) - session_status = get_session_status(session_id_3) - @test session_status == "completed" - - # Sleep for the delay_time and check that the underlying resources are destroyed - # by creating a new session and ensuring that it uses different resources - sleep(delay_time * 60) - session_id_4 = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - release_resources_after=delay_time, - nowait=true - ) - resource_id_4 = get_session().resource_id - @test resource_id_4 != resource_id_1 +# end_session(session_id_3) +# sleep(60) +# session_status = get_session_status(session_id_3) +# @test session_status == "completed" + +# # Sleep for the delay_time and check that the underlying resources are destroyed +# # by creating a new session and ensuring that it uses different resources +# sleep(delay_time * 60) +# session_id_4 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# 
store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# release_resources_after=delay_time, +# wait_now=false +# ) +# resource_id_4 = get_session().resource_id +# @test resource_id_4 != resource_id_1 - end_session(session_id_4, release_resources_now=true) -end +# end_session(session_id_4, release_resources_now=true) +# end + +# @testset "Start a session with dev paths" begin +# session_id = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# url = "https://github.com/banyan-team/banyan-julia.git", +# branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), +# directory = "banyan-julia/Banyan/test", +# dev_paths = [ +# "banyan-julia/Banyan", +# ], +# force_pull = true, +# force_sync = true, +# force_install = true, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# ) +# session_status = get_session_status(session_id) +# end_session(session_id, release_resources_now=true) +# @test session_status == "running" +# end + +# @testset "Create sessions with nowait=$nowait" for +# nowait in [true, false] +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] + +# session_id = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# nowait=nowait +# ) -@testset "Start a session with dev paths" begin - session_id = start_session( +# session_status = get_session_status(session_id) +# if !nowait +# @test session_status == "running" +# else +# @test session_status == "creating" +# while session_status == "creating" +# sleep(20) +# session_status = get_session_status(session_id) +# end +# @test session_status == "running" +# end + +# end_session(session_id, release_resources_now=true) +# end + +# @testset "Create sessions where store_logs_in_s3=$store_logs_in_s3" for +# store_logs_in_s3 in [true, false] +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] + +# session_id = start_session( +# cluster_name=cluster_name, +# nworkers = 2, +# store_logs_in_s3=store_logs_in_s3, +# ) +# end_session(session_id, release_resources_now=true) +# sleep(60) + +# log_file = "banyan-log-for-session-$session_id" +# println("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)") +# @test store_logs_in_s3 == isfile( +# S3Path("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)", +# config=Banyan.global_aws_config()) +# ) +# end + +# @testset "Starting session with failure in $scenario" for scenario in [ +# "invalid julia version", +# "invalid branch name", +# "invalid dev paths" +# ] +# Pkg.activate("./") + +# try +# if scenario == "invalid julia version" +# # Temporarily overwrite `get_julia_version` +# Banyan.get_julia_version() = "invalidversion" +# @test_throws begin +# session_id = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# ) +# end ErrorException +# elseif scenario == "invalid branch name" +# @test_throws begin +# session_id = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# url = "https://github.com/banyan-team/banyan-julia.git", +# branch = "nonexistant-branch", +# directory = "banyan-julia/Banyan/test", +# dev_paths = [ +# "banyan-julia/Banyan", +# ], +# force_pull = true, +# force_sync = true, +# force_install = true, +# ) +# end ErrorException +# elseif scenario == "invalid dev paths" +# @test_throws begin +# session_id = 
start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# url = "https://github.com/banyan-team/banyan-julia.git", +# branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), +# directory = "banyan-julia/Banyan/test", +# dev_paths = [ +# "banyan-julia/Banyan", +# "banyan-julia/NonExistantPackage" +# ], +# force_pull = true, +# force_sync = true, +# force_install = true, +# ) +# end ErrorException +# end +# catch +# end +# end + +# @testset "Reusing session that fails" begin +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] + +# # Start a session +# session_id_1 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# force_sync=true +# ) +# resource_id_1 = get_session().resource_id +# session_status_1 = get_session_status(session_id_1) + +# # Trigger a failure in the session that will end the session +# try +# @test_throws begin +# offloaded(distributed=true) do +# error("Oops sorry this is an error") +# end +# end ErrorException +# catch +# end +# session_status_1_after_failure = get_session_status(session_id_1) + +# # Start a new session (it should reuse the resources of the failed session) and then end it +# session_id_2 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# wait_now=false +# ) +# resource_id_2 = get_session().resource_id +# session_status_2 = get_session_status(session_id_2) +# end_session(session_id_2, release_resources_now=true) + +# # Assert +# @test session_status_1 == "running" +# @test session_status_1_after_failure == "failed" +# @test resource_id_2 == resource_id_1 +# end + +@testset "Running session with print_logs=$print_logs and store_logs_in_s3=$store_logs_in_s3" for + print_logs in [true, false], + store_logs_in_s3 in [true, false] + + println("Before run_session") + run_session( + "file://run_session_test_script.jl", cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, + nworkers=1, url = "https://github.com/banyan-team/banyan-julia.git", branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), directory = "banyan-julia/Banyan/test", dev_paths = [ "banyan-julia/Banyan", ], - force_pull = true, - force_sync = true, - force_install = true, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", + print_logs = print_logs, + store_logs_in_s3 = store_logs_in_s3, + instance_type="t3.large", + disk_capacity="auto" ) - session_status = get_session_status(session_id) - end_session(session_id, release_resources_now=true) - @test session_status == "running" end -@testset "Create sessions with nowait=$nowait" for - nowait in [true, false] - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - - session_id = start_session( +@testset "Starting session" begin + s = start_session( cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - nowait=nowait - ) - - session_status = get_session_status(session_id) - if !nowait - @test session_status == "running" - else - @test session_status == "creating" - while session_status == "creating" - sleep(20) - session_status = get_session_status(session_id) - end - @test session_status == "running" - end - - end_session(session_id, release_resources_now=true) -end - -@testset "Create sessions where store_logs_in_s3=$store_logs_in_s3" for - 
store_logs_in_s3 in [true, false] - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - - session_id = start_session( - cluster_name=cluster_name, - nworkers = 2, - store_logs_in_s3=store_logs_in_s3, - ) - end_session(session_id, release_resources_now=true) - sleep(60) - - log_file = "banyan-log-for-session-$session_id" - println("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)") - @test store_logs_in_s3 == isfile( - S3Path("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)", - config=Banyan.global_aws_config()) + nworkers=1, + url = "https://github.com/banyan-team/banyan-julia.git", + branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), + directory = "banyan-julia/Banyan/test", + dev_paths = [ + "banyan-julia/Banyan", + ], + instance_type="t3.large", + disk_capacity="auto" ) -end - -@testset "Starting session with failure in $scenario" for scenario in [ - "invalid julia version", - "invalid branch name", - "invalid dev paths" -] - Pkg.activate("./") - - try - if scenario == "invalid julia version" - # Temporarily overwrite `get_julia_version` - Banyan.get_julia_version() = "invalidversion" - @test_throws begin - session_id = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - ) - end ErrorException - elseif scenario == "invalid branch name" - @test_throws begin - session_id = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - url = "https://github.com/banyan-team/banyan-julia.git", - branch = "nonexistant-branch", - directory = "banyan-julia/Banyan/test", - dev_paths = [ - "banyan-julia/Banyan", - ], - force_pull = true, - force_sync = true, - force_install = true, - ) - end ErrorException - elseif scenario == "invalid dev paths" - @test_throws begin - session_id = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - url = "https://github.com/banyan-team/banyan-julia.git", - branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), - directory = "banyan-julia/Banyan/test", - dev_paths = [ - "banyan-julia/Banyan", - "banyan-julia/NonExistantPackage" - ], - force_pull = true, - force_sync = true, - force_install = true, - ) - end ErrorException - end - catch - end -end - -@testset "Reusing session that fails" begin - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] + @test get_session().id == get_session_id() + end_session(s) - # Start a session - session_id_1 = start_session( + s = start_session( cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - force_sync=true + nworkers=1, + url = "https://github.com/banyan-team/banyan-julia.git", + branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), + directory = "banyan-julia/Banyan/test", + dev_paths = [ + "banyan-julia/Banyan", + ], + start_now=true, + instance_type="t3.large", + disk_capacity="auto" ) - resource_id_1 = get_session().resource_id - session_status_1 = get_session_status(session_id_1) - - # Trigger a failure in the session that will end the session - try - @test_throws begin - offloaded(distributed=true) do - error("Oops sorry this is an error") - end - end ErrorException - catch - end - session_status_1_after_failure = get_session_status(session_id_1) - - # Start a new session (it should reuse the resources of the failed session) and then end it - session_id_2 = start_session( + @test get_session().id == 
get_session_id() + end_session(s) + + s = start_session( cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - nowait=true + nworkers=1, + url = "https://github.com/banyan-team/banyan-julia.git", + branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), + directory = "banyan-julia/Banyan/test", + dev_paths = [ + "banyan-julia/Banyan", + ], + wait_now=true, + instance_type="t3.large", + disk_capacity="auto" ) - resource_id_2 = get_session().resource_id - session_status_2 = get_session_status(session_id_2) - end_session(session_id_2, release_resources_now=true) - - # Assert - @test session_status_1 == "running" - @test session_status_1_after_failure == "failed" - @test resource_id_2 == resource_id_1 + @test get_session().id == get_session_id() + end_session(s) end # Outdated testset...revisit later...probably alread tested through above tests diff --git a/BanyanArrays/test/Project.toml b/BanyanArrays/test/Project.toml index 1ed9683e..9d71cb8c 100644 --- a/BanyanArrays/test/Project.toml +++ b/BanyanArrays/test/Project.toml @@ -10,6 +10,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] -Banyan = "0.4.0" +Banyan = "0.4.1" ReTest = "0.3.2" julia = "^1.6" diff --git a/BanyanHDF5/test/Project.toml b/BanyanHDF5/test/Project.toml index 21ece6ad..6fe1cb9b 100644 --- a/BanyanHDF5/test/Project.toml +++ b/BanyanHDF5/test/Project.toml @@ -13,7 +13,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] -Banyan = "0.4.0" -BanyanArrays = "0.4.0" +Banyan = "0.4.1" +BanyanArrays = "0.4.1" ReTest = "0.3.2" julia = "^1.6" diff --git a/BanyanHDF5/test/runtests.jl b/BanyanHDF5/test/runtests.jl index 56b16b58..52a8d861 100644 --- a/BanyanHDF5/test/runtests.jl +++ b/BanyanHDF5/test/runtests.jl @@ -30,6 +30,7 @@ function use_session_for_testing( # Set the session and create a new one if needed global sessions_for_testing + println("sessions_for_testing=$(sessions_for_testing)") set_session( if haskey(sessions_for_testing, session_config_hash) sessions_for_testing[session_config_hash] @@ -69,6 +70,7 @@ function use_session_for_testing( ) # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() + println("Set sessions_for_testing[session_config_hash] to get_session_id() for $(sessions_for_testing[session_config_hash])") configure_scheduling(name = scheduling_config_name) diff --git a/BanyanImages/test/Project.toml b/BanyanImages/test/Project.toml index 699fe6ac..5739e8e9 100644 --- a/BanyanImages/test/Project.toml +++ b/BanyanImages/test/Project.toml @@ -13,6 +13,6 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89" [compat] -Banyan = "0.4.0" +Banyan = "0.4.1" ReTest = "0.3.2" julia = "^1.6"
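Reviewer note (not part of the patch): a minimal sketch of how the reworked session lifecycle above is intended to be used. start_session now returns immediately and creates the session lazily in a background Task unless start_now or wait_now is passed; the cluster name and worker count below are placeholders, and only the start_session, wait_for_session, and end_session signatures introduced in this commit are assumed.

    using Banyan

    # Returns right away; the ID initially refers to the start-session task and the
    # actual session is created in the background.
    session_id = start_session(cluster_name = "my-cluster", nworkers = 2)

    # Block until the session status is "running"; pass false as the second
    # argument to suppress the progress spinner.
    wait_for_session(session_id)

    try
        # ... offloaded computation would go here ...
    finally
        # destroy_cluster=true would additionally tear down the underlying cluster.
        end_session(session_id, release_resources_now = true)
    end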
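Similarly, a hedged sketch of the new run_session entry point, which now takes the code files as its first positional argument and forwards the remaining keyword arguments (cluster_name, nworkers, and so on) to start_session, mirroring the test added in this patch; the script path and cluster name are placeholders.

    using Banyan

    # Starts a session, runs the script on it, and ends the session when done.
    # With print_logs=true the logs are temporarily stored in S3 so they can be
    # printed, then deleted afterwards if store_logs_in_s3=false.
    run_session(
        "file://run_session_test_script.jl";
        cluster_name = "my-cluster",
        nworkers = 2,
        print_logs = true,
        store_logs_in_s3 = false,
    )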