From 41ab5ba12f19f4ff338740241d7e48fdb46615f2 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 2 Feb 2024 12:07:36 -0700 Subject: [PATCH 01/84] patched shutting down thread on function result --- src/endpoint/FaabricEndpointHandler.cpp | 5 ++++- src/scheduler/Executor.cpp | 13 ++++++++++--- src/scheduler/Scheduler.cpp | 8 +++++++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/endpoint/FaabricEndpointHandler.cpp b/src/endpoint/FaabricEndpointHandler.cpp index aebb33668..2c70856d7 100644 --- a/src/endpoint/FaabricEndpointHandler.cpp +++ b/src/endpoint/FaabricEndpointHandler.cpp @@ -174,7 +174,10 @@ void FaabricEndpointHandler::onFunctionResult( faabric::util::funcToString(result, true)); response.body() = result.outputdata(); - return ctx.sendFunction(std::move(response)); + SPDLOG_DEBUG("Worker thread {} sending response", gettid()); + ctx.sendFunction(std::move(response)); + SPDLOG_DEBUG("Worker thread {} response sent", gettid()); + ctx.ioc.stop(); } } diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 9da893120..0c30bd4d8 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -457,7 +457,7 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // We terminate these threads by sending a shutdown message, but having this // check means they won't hang infinitely if destructed. while (!st.stop_requested()) { - SPDLOG_TRACE("Thread starting loop {}:{}", id, threadPoolIdx); + SPDLOG_DEBUG("Thread starting loop {}:{}", id, threadPoolIdx); ExecutorTask task; @@ -478,6 +478,7 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // will handle the clean-up if (task.messageIndex == POOL_SHUTDOWN) { SPDLOG_DEBUG("Killing thread pool thread {}:{}", id, threadPoolIdx); + st. return; } @@ -585,7 +586,7 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) assert(oldTaskCount >= 0); bool isLastInBatch = oldTaskCount == 1; - SPDLOG_TRACE("Task {} finished by thread {}:{} ({} left)", + SPDLOG_INFO("[Faabric] Task {} finished by thread {}:{} ({} left)", faabric::util::funcToString(msg, true), id, threadPoolIdx, @@ -645,6 +646,7 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // executor has been reset, otherwise the executor may not be reused for // a repeat invocation. if (isThreads) { + SPDLOG_INFO("Set result of the task"); ZoneScopedN("Task set result"); // Set non-final thread result if (isLastInBatch) { @@ -662,6 +664,7 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // the main function in a threaded application, in which case we // want to stop any tracking and delete the main thread snapshot if (!isThreads && isLastInBatch) { + SPDLOG_INFO("Not threads request and last in batch"); // Stop tracking memory std::span memView = getMemoryView(); if (!memView.empty()) { @@ -678,6 +681,7 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // claim. Note that we have to release the claim _after_ resetting, // otherwise the executor won't be ready for reuse if (isLastInBatch) { + SPDLOG_INFO("Last in batch detected, resetting executor and releasing claim"); // Threads skip the reset as they will be restored from their // respective snapshot on the next execution. if (isThreads || skippedExec) { @@ -693,7 +697,7 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) } task = ExecutorTask(); - + SPDLOG_INFO("Fetched executor task"); // Return this thread index to the pool available for scheduling { faabric::util::UniqueLock lock(threadsMutex); @@ -710,8 +714,11 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // try to schedule another function and be unable to reuse this // executor. ZoneScopedN("Task vacate slot"); + SPDLOG_INFO("Vacating slot"); sch.vacateSlot(); + SPDLOG_INFO("[Executor] Slot vacated"); } + SPDLOG_INFO("Calling soft shutdown"); softShutdown(); } diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 7b57c0c57..5bab4f6da 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -369,7 +369,13 @@ void Scheduler::addRegisteredHost(const std::string& host, void Scheduler::vacateSlot() { ZoneScopedNS("Vacate scheduler slot", 5); - thisHostUsedSlots.fetch_sub(1, std::memory_order_acq_rel); + SPDLOG_INFO("[Scheduler::vacateSlot() - Vacating slot"); + try { + thisHostUsedSlots.fetch_sub(1, std::memory_order_acq_rel); + SPDLOG_INFO("[Scheduler::vacateSlot() - Slot vacated"); + } catch (std::exception& ex) { + SPDLOG_ERROR("Caught exception vacating slot: {}", ex.what()); + } } faabric::util::SchedulingDecision Scheduler::callFunctions( From c157bcec1b4d99bc889eb14499d9446f0a3ecc27 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 2 Feb 2024 12:29:05 -0700 Subject: [PATCH 02/84] added debug --- src/endpoint/FaabricEndpointHandler.cpp | 2 +- src/scheduler/Executor.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/endpoint/FaabricEndpointHandler.cpp b/src/endpoint/FaabricEndpointHandler.cpp index 2c70856d7..fcc00d5b1 100644 --- a/src/endpoint/FaabricEndpointHandler.cpp +++ b/src/endpoint/FaabricEndpointHandler.cpp @@ -177,7 +177,7 @@ void FaabricEndpointHandler::onFunctionResult( SPDLOG_DEBUG("Worker thread {} sending response", gettid()); ctx.sendFunction(std::move(response)); SPDLOG_DEBUG("Worker thread {} response sent", gettid()); - ctx.ioc.stop(); + // We're done with this request } } diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index 0c30bd4d8..bfdb1252a 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -457,7 +457,7 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // We terminate these threads by sending a shutdown message, but having this // check means they won't hang infinitely if destructed. while (!st.stop_requested()) { - SPDLOG_DEBUG("Thread starting loop {}:{}", id, threadPoolIdx); + SPDLOG_INFO("Thread starting loop {}:{}", id, threadPoolIdx); ExecutorTask task; @@ -478,7 +478,6 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // will handle the clean-up if (task.messageIndex == POOL_SHUTDOWN) { SPDLOG_DEBUG("Killing thread pool thread {}:{}", id, threadPoolIdx); - st. return; } From 5986f5dbafbdb1ce5a567610d93cc5a214988d9d Mon Sep 17 00:00:00 2001 From: root Date: Fri, 2 Feb 2024 13:48:19 -0700 Subject: [PATCH 03/84] Adds logging to the weird non-STL Queue --- include/faabric/util/queue.h | 21 ++++++++++++------ src/scheduler/Executor.cpp | 42 ++++++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/include/faabric/util/queue.h b/include/faabric/util/queue.h index 6d89aab18..b74e77516 100644 --- a/include/faabric/util/queue.h +++ b/include/faabric/util/queue.h @@ -54,10 +54,13 @@ class Queue SPDLOG_ERROR("Invalid queue timeout: {} <= 0", timeoutMs); throw std::runtime_error("Invalid queue timeout"); } - - while (mq.empty()) { + + while (mq.size() == 0) { + SPDLOG_DEBUG("Queue is empty... waiting for dequeue"); std::cv_status returnVal = enqueueNotifier.wait_for( lock, std::chrono::milliseconds(timeoutMs)); + + SPDLOG_DEBUG("Queue has been notified"); // Work out if this has returned due to timeout expiring if (returnVal == std::cv_status::timeout) { @@ -65,11 +68,15 @@ class Queue } } - T value = std::move(mq.front()); - mq.pop(); - emptyNotifier.notify_one(); - - return value; + try { + T value = std::move(mq.front()); + mq.pop(); + emptyNotifier.notify_one(); + return value; + } catch (std::exception& e) { + SPDLOG_ERROR("Caught exception when dequeueing: {}", e.what()); + throw; + } } T* peek(long timeoutMs = 0) diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index bfdb1252a..b14d9d001 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -433,8 +433,10 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) } }; if (threadPoolIdx == 0) { + SPDLOG_INFO("Thread pool thread {}:{} is the main thread", id, threadPoolIdx); std::unique_lock _lock(resetMutex); try { + SPDLOG_INFO("Resetting module"); reset(boundMessage); } catch (...) { SPDLOG_ERROR("Caught exception when initialising module for {}", @@ -463,9 +465,22 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) try { ZoneScopedNC("Dequeue task", 0x111111); - task = threadTaskQueues[threadPoolIdx].dequeue(conf.boundTimeout); + SPDLOG_DEBUG("Dequeueing task for thread {}:{} (timeout {}ms)", + id, + threadPoolIdx, + conf.boundTimeout); + + task = threadTaskQueues.at(threadPoolIdx).dequeue(conf.boundTimeout); + SPDLOG_DEBUG("Successfully dequeued task for thread {}:{}", + id, + threadPoolIdx); + } catch (std::bad_optional_access& e) { + SPDLOG_DEBUG("Bad optional access in thread {}:{}", + id, + threadPoolIdx); + continue; } catch (faabric::util::QueueTimeoutException& ex) { - SPDLOG_TRACE( + SPDLOG_DEBUG( "Thread {}:{} got no messages in timeout {}ms, looping", id, threadPoolIdx, @@ -482,15 +497,16 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) } assert(task.req->messages_size() >= task.messageIndex + 1); - faabric::Message& msg = - task.req->mutable_messages()->at(task.messageIndex); + faabric::Message& msg = task.req->mutable_messages()->at(task.messageIndex); // Start dirty tracking if executing threads across hosts bool isSingleHost = task.req->singlehost(); - bool isThreads = - task.req->type() == faabric::BatchExecuteRequest::THREADS; + bool isThreads = task.req->type() == faabric::BatchExecuteRequest::THREADS; bool doDirtyTracking = isThreads && !isSingleHost; if (doDirtyTracking) { + SPDLOG_DEBUG("Starting dirty tracking for thread {}:{}", + id, + threadPoolIdx); // If tracking is thread local, start here as it will happen for // each thread tracker->startThreadLocalTracking(getMemoryView()); @@ -499,11 +515,10 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) // Check ptp group std::shared_ptr group = nullptr; if (msg.groupid() > 0) { - group = - faabric::transport::PointToPointGroup::getGroup(msg.groupid()); + group = faabric::transport::PointToPointGroup::getGroup(msg.groupid()); } - SPDLOG_TRACE("Thread {}:{} executing task {} ({}, thread={}, group={})", + SPDLOG_INFO("Thread {}:{} executing task {} ({}, thread={}, group={})", id, threadPoolIdx, task.messageIndex, @@ -512,11 +527,16 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) msg.groupid()); // Set up context - getScheduler().monitorStartedTasks.fetch_add(1, - std::memory_order_acq_rel); + SPDLOG_INFO("Setting executor context for task {}:{}", + id, + threadPoolIdx); + getScheduler().monitorStartedTasks.fetch_add(1,std::memory_order_acq_rel); ExecutorContext::set(this, task.req, task.messageIndex); // Execute the task + SPDLOG_INFO("Executing task {}:{}", + id, + threadPoolIdx); int64_t msgTimestamp = msg.timestamp(); int64_t nowTimestamp = faabric::util::getGlobalClock().epochMillis(); int32_t returnValue; From 546c3f0b14bbbb4eed77182b33e57c7cd13ab057 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 6 Feb 2024 08:20:11 -0700 Subject: [PATCH 04/84] adds exception handling to setFunctionResult --- include/faabric/scheduler/Scheduler.h | 6 +++++- src/scheduler/Executor.cpp | 12 ++++++++++-- src/scheduler/FunctionCallServer.cpp | 17 +++++++++++++++-- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 70c630e79..20d2ffcf6 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -277,7 +277,11 @@ class Scheduler inline void setFunctionResult(const faabric::Message& msg) { - setFunctionResult(std::make_unique(msg)); + try { + setFunctionResult(std::make_unique(msg)); + } catch (const std::exception& e) { + SPDLOG_ERROR("[Scheduler.h] Failed to set function result: {}", e.what()); + } } faabric::Message getFunctionResult(unsigned int messageId, diff --git a/src/scheduler/Executor.cpp b/src/scheduler/Executor.cpp index b14d9d001..6f6d3669d 100644 --- a/src/scheduler/Executor.cpp +++ b/src/scheduler/Executor.cpp @@ -428,7 +428,11 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) if (isThreads) { sch.setThreadResult(msg, 1, "", {}); } else { - sch.setFunctionResult(msg); + try { + sch.setFunctionResult(msg); + } catch (const std::exception& ex) { + SPDLOG_ERROR("[Executor.cpp::threadPoolThread] Failed to set function result: {}", ex.what()); + } } } }; @@ -677,7 +681,11 @@ void Executor::threadPoolThread(std::stop_token st, int threadPoolIdx) } else { ZoneScopedN("Task set result"); // Set normal function result - sch.setFunctionResult(msg); + try { + sch.setFunctionResult(msg); + } catch (const std::exception& ex) { + SPDLOG_ERROR("Failed to set function result: {}", ex.what()); + } } // If this is not a threads request and last in its batch, it may be // the main function in a threaded application, in which case we diff --git a/src/scheduler/FunctionCallServer.cpp b/src/scheduler/FunctionCallServer.cpp index 5ca8cea29..80d6854be 100644 --- a/src/scheduler/FunctionCallServer.cpp +++ b/src/scheduler/FunctionCallServer.cpp @@ -136,6 +136,11 @@ void FunctionCallServer::recvDirectResult(std::span buffer) PARSE_MSG(faabric::DirectResultTransmission, buffer.data(), buffer.size()) std::unique_ptr result{ parsedMsg.release_result() }; + try { + scheduler.setFunctionResult(std::move(result)); + } catch (const std::exception& e) { + SPDLOG_ERROR("Failed to set direct result: {}", e.what()); + } scheduler.setFunctionResult(std::move(result)); } @@ -150,6 +155,7 @@ FunctionCallServer::recvPendingMigrations(std::span buffer) scheduler.addPendingMigration(msgPtr); return std::make_unique(); + } std::unique_ptr @@ -157,11 +163,18 @@ FunctionCallServer::recvNdpDeltaRequest(std::span buffer) { PARSE_MSG(faabric::GetNdpDelta, buffer.data(), buffer.size()); - auto ndpDelta = ndpDeltaHandlers.get(parsedMsg.id()).value()(); + auto ndpDelta = ndpDeltaHandlers.get(parsedMsg.id()); + + if (!ndpDelta.has_value()) { + SPDLOG_ERROR("No NDP delta handler found for id {}", parsedMsg.id()); + return std::make_unique(); + } + + std::vector ndpDeltaData = ndpDelta.value()(); auto response = std::make_unique(); response->mutable_delta()->assign( - reinterpret_cast(ndpDelta.data()), ndpDelta.size()); + reinterpret_cast(ndpDeltaData.data()), ndpDeltaData.size()); return response; } } From e403138e52feeb4aff48d89bb3b63a21db65e864 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 6 Feb 2024 09:49:20 -0700 Subject: [PATCH 05/84] attempts to fix funky delta --- src/scheduler/FunctionCallClient.cpp | 2 ++ src/scheduler/FunctionCallServer.cpp | 6 ++++-- src/scheduler/Scheduler.cpp | 5 ++++- src/util/delta.cpp | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/scheduler/FunctionCallClient.cpp b/src/scheduler/FunctionCallClient.cpp index 8089013cc..aee8be126 100644 --- a/src/scheduler/FunctionCallClient.cpp +++ b/src/scheduler/FunctionCallClient.cpp @@ -196,10 +196,12 @@ void FunctionCallClient::unregister(faabric::UnregisterRequest& req) faabric::NdpDelta FunctionCallClient::requestNdpDelta(int msgId) { + SPDLOG_DEBUG("Requesting NDP delta for message {}", msgId); faabric::GetNdpDelta gnd; gnd.set_id(msgId); faabric::NdpDelta delta; syncSend(faabric::scheduler::FunctionCalls::NdpDeltaRequest, &gnd, &delta); + SPDLOG_DEBUG("Received NDP delta for message {}", msgId); return delta; } } diff --git a/src/scheduler/FunctionCallServer.cpp b/src/scheduler/FunctionCallServer.cpp index 80d6854be..95267aaae 100644 --- a/src/scheduler/FunctionCallServer.cpp +++ b/src/scheduler/FunctionCallServer.cpp @@ -28,11 +28,13 @@ void FunctionCallServer::registerNdpDeltaHandler( int id, std::function()> handler) { + SPDLOG_DEBUG("Registering NDP delta handler for id {}", id); ndpDeltaHandlers.insertOrAssign(id, std::move(handler)); } void FunctionCallServer::removeNdpDeltaHandler(int id) { + SPDLOG_DEBUG("Removing NDP delta handler for id {}", id); ndpDeltaHandlers.erase(id); } @@ -74,6 +76,7 @@ std::unique_ptr FunctionCallServer::doSyncRecv( return recvPendingMigrations(message.udata()); } case faabric::scheduler::FunctionCalls::NdpDeltaRequest: { + SPDLOG_DEBUG("Received NDP delta request"); return recvNdpDeltaRequest(message.udata()); } default: { @@ -141,7 +144,6 @@ void FunctionCallServer::recvDirectResult(std::span buffer) } catch (const std::exception& e) { SPDLOG_ERROR("Failed to set direct result: {}", e.what()); } - scheduler.setFunctionResult(std::move(result)); } std::unique_ptr @@ -155,7 +157,7 @@ FunctionCallServer::recvPendingMigrations(std::span buffer) scheduler.addPendingMigration(msgPtr); return std::make_unique(); - + } std::unique_ptr diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 5bab4f6da..7902965b6 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -1240,8 +1241,10 @@ void Scheduler::setFunctionResult(std::unique_ptr msg) if (it != localResults.end()) { it->second->setValue(std::move(msg)); } else { + SPDLOG_ERROR("Result received for unknown message {}! Removing delta handler as a precaution", msg->id()); + faabric::scheduler::FunctionCallServer::removeNdpDeltaHandler(msg->id()); throw std::runtime_error( - "Got direct result, but promise is registered"); + "Result received for unknown message " + std::to_string(msg->id())); } return; } diff --git a/src/util/delta.cpp b/src/util/delta.cpp index fea09b933..07e565644 100644 --- a/src/util/delta.cpp +++ b/src/util/delta.cpp @@ -236,7 +236,7 @@ void applyDelta(std::span delta, std::function getDataPointer) { size_t deltaLen = delta.size(); - if (deltaLen < 2) { + if (deltaLen < 1) { throw std::runtime_error("Delta too short to be valid"); } if (delta[0] != DELTA_PROTOCOL_VERSION) { From b05e6a3ae443b449829008fc9eecede55dc04ab0 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Sun, 11 Feb 2024 15:17:13 +0000 Subject: [PATCH 06/84] refactors systems metrics to faabric scheduler --- src/util/system_metrics.cpp | 109 ++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 src/util/system_metrics.cpp diff --git a/src/util/system_metrics.cpp b/src/util/system_metrics.cpp new file mode 100644 index 000000000..2ff6bbd73 --- /dev/null +++ b/src/util/system_metrics.cpp @@ -0,0 +1,109 @@ + +#pragma once +#include +#include +#include + + +namespace faabric::util { + struct UtilisationStats + { + double cpu_utilisation; + double ram_utilisation; + double load_average; + }; + + struct CPUStats + { + long totalCpuTime; + long idleCpuTime; + }; + + struct MemStats + { + uint64_t total; + uint64_t available; + }; + + CPUStats getCPUStats() { + SPDLOG_INFO("[ndp_endpoint::getCPUUtilisation] Getting CPU utilisation"); + std::ifstream cpuinfo("/proc/stat"); + std::string line; + if (!cpuinfo.is_open()) { + throw std::runtime_error("Unable to open /proc/stat"); + } + std::getline(cpuinfo, line); + // Extract CPU utilization information from the line + std::istringstream iss(line); + std::string cpuLabel; + long user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice; + iss >> cpuLabel >> user >> nice >> system >> idle >> iowait >> irq >> softirq >> steal >> guest >> guest_nice; + // Calculate total CPU time + long totalCpuTime = user + nice + system + idle + iowait + irq + softirq + steal + guest + guest_nice; + CPUStats stats; + stats.totalCpuTime = totalCpuTime; + stats.idleCpuTime = idle; + return stats; + } + + double getMemoryUtilisation() + { + std::ifstream meminfo("/proc/meminfo"); + std::string line; + if (!meminfo.is_open()) { + throw std::runtime_error("Unable to open /proc/meminfo"); + } + std::getline(meminfo, line); + std::istringstream ss(line); + std::string mem; + ss >> mem; + uint64_t totalMem; + ss >> totalMem; + std::getline(meminfo, line); + ss = std::istringstream(line); + ss >> mem; + uint64_t availableMem; + ss >> availableMem; + return 1.0 - (availableMem / (double)totalMem); + } + + double getLoadAverage() + { + std::ifstream loadavg("/proc/loadavg"); + std::string line; + if (!loadavg.is_open()) { + throw std::runtime_error("Unable to open /proc/loadavg"); + } + + std::getline(loadavg, line); + std::istringstream ss(line); + double load1, load5, load15; + ss >> load1 >> load5 >> load15; + return load1; + } + + UtilisationStats getSystemUtilisation() + { + UtilisationStats stats; + // Get initial figures + CPUStats cpuStart = getCPUUtilisation(); + SPDLOG_DEBUG("Total CPU time: {}", cpuStart.totalCpuTime); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + // Get final figures + CPUStats cpuEnd = getCPUUtilisation(); + SPDLOG_DEBUG("Total CPU time after wait: {}", cpuEnd.totalCpuTime); + long cpuTimeDelta = cpuEnd.totalCpuTime - cpuStart.totalCpuTime; + long idleTimeDelta = cpuEnd.idleCpuTime - cpuStart.idleCpuTime; + double cpu_utilisation = 1.0 - (idleTimeDelta / (double) cpuTimeDelta); + stats.cpu_utilisation = cpu_utilisation; + stats.ram_utilisation = getMemoryUtilisation(); + stats.load_average = getLoadAverage(); + return stats; + } + + + + +} + + From f4492c3c47b281c6adfedf80a1f135fb92f5e65e Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Sun, 11 Feb 2024 15:25:26 +0000 Subject: [PATCH 07/84] updates CMAKELists for faabric::util library --- src/util/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 6d0f81eef..0cbbd6263 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -27,4 +27,5 @@ faabric_lib(util string_tools.cpp timing.cpp testing.cpp + system_metrics.cpp ) From e83ca1eb02c70ab0d42fc59ef97e8b8ed430b68c Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Sun, 11 Feb 2024 15:26:49 +0000 Subject: [PATCH 08/84] removes pragma once from cpp --- src/util/system_metrics.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/util/system_metrics.cpp b/src/util/system_metrics.cpp index 2ff6bbd73..8a3f93988 100644 --- a/src/util/system_metrics.cpp +++ b/src/util/system_metrics.cpp @@ -1,5 +1,4 @@ -#pragma once #include #include #include From 62c2e2f9a0d8a5ebeafdcfa6097845a7b1fb89ac Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Sun, 11 Feb 2024 15:29:03 +0000 Subject: [PATCH 09/84] removes pragma once from cpp --- src/util/system_metrics.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/util/system_metrics.cpp b/src/util/system_metrics.cpp index 8a3f93988..6ca9df52b 100644 --- a/src/util/system_metrics.cpp +++ b/src/util/system_metrics.cpp @@ -2,7 +2,12 @@ #include #include #include - +#include +#include +#include +#include +#include +#include namespace faabric::util { struct UtilisationStats @@ -24,8 +29,7 @@ namespace faabric::util { uint64_t available; }; - CPUStats getCPUStats() { - SPDLOG_INFO("[ndp_endpoint::getCPUUtilisation] Getting CPU utilisation"); + CPUStats getCPUUtilisation() { std::ifstream cpuinfo("/proc/stat"); std::string line; if (!cpuinfo.is_open()) { @@ -84,13 +88,13 @@ namespace faabric::util { UtilisationStats getSystemUtilisation() { UtilisationStats stats; + // Get initial figures CPUStats cpuStart = getCPUUtilisation(); - SPDLOG_DEBUG("Total CPU time: {}", cpuStart.totalCpuTime); std::this_thread::sleep_for(std::chrono::milliseconds(1)); + // Get final figures CPUStats cpuEnd = getCPUUtilisation(); - SPDLOG_DEBUG("Total CPU time after wait: {}", cpuEnd.totalCpuTime); long cpuTimeDelta = cpuEnd.totalCpuTime - cpuStart.totalCpuTime; long idleTimeDelta = cpuEnd.idleCpuTime - cpuStart.idleCpuTime; double cpu_utilisation = 1.0 - (idleTimeDelta / (double) cpuTimeDelta); From f7b92bb4ffddffa771f79f0f542c0553198a5515 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Sun, 11 Feb 2024 15:32:15 +0000 Subject: [PATCH 10/84] moves definition of system_metrics to header file --- include/faabric/util/system_metrics.h | 35 +++++++++++++++++++++++++++ src/util/system_metrics.cpp | 32 ++---------------------- 2 files changed, 37 insertions(+), 30 deletions(-) create mode 100644 include/faabric/util/system_metrics.h diff --git a/include/faabric/util/system_metrics.h b/include/faabric/util/system_metrics.h new file mode 100644 index 000000000..8b0ba1062 --- /dev/null +++ b/include/faabric/util/system_metrics.h @@ -0,0 +1,35 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faabric::util { + struct UtilisationStats + { + double cpu_utilisation; + double ram_utilisation; + double load_average; + }; + + struct CPUStats + { + long totalCpuTime; + long idleCpuTime; + }; + + struct MemStats + { + uint64_t total; + uint64_t available; + }; + + UtilisationStats getSystemUtilisation(); + CPUStats getCPUUtilisation(); + MemStats getMemoryUtilisation(); +} \ No newline at end of file diff --git a/src/util/system_metrics.cpp b/src/util/system_metrics.cpp index 6ca9df52b..dc3b4b6eb 100644 --- a/src/util/system_metrics.cpp +++ b/src/util/system_metrics.cpp @@ -1,33 +1,5 @@ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - +#include namespace faabric::util { - struct UtilisationStats - { - double cpu_utilisation; - double ram_utilisation; - double load_average; - }; - - struct CPUStats - { - long totalCpuTime; - long idleCpuTime; - }; - - struct MemStats - { - uint64_t total; - uint64_t available; - }; CPUStats getCPUUtilisation() { std::ifstream cpuinfo("/proc/stat"); @@ -88,7 +60,7 @@ namespace faabric::util { UtilisationStats getSystemUtilisation() { UtilisationStats stats; - + // Get initial figures CPUStats cpuStart = getCPUUtilisation(); std::this_thread::sleep_for(std::chrono::milliseconds(1)); From d5770f94804ae477f1a76b0e19ef6edd6e745b4d Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Sun, 11 Feb 2024 15:35:50 +0000 Subject: [PATCH 11/84] fixes function signature for MemStats --- include/faabric/util/system_metrics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/faabric/util/system_metrics.h b/include/faabric/util/system_metrics.h index 8b0ba1062..06aa4399e 100644 --- a/include/faabric/util/system_metrics.h +++ b/include/faabric/util/system_metrics.h @@ -31,5 +31,5 @@ namespace faabric::util { UtilisationStats getSystemUtilisation(); CPUStats getCPUUtilisation(); - MemStats getMemoryUtilisation(); + double getMemoryUtilisation(); } \ No newline at end of file From 425d2705b187e6104ec7b8659fc591967ce8e679 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Tue, 13 Feb 2024 12:49:09 +0000 Subject: [PATCH 12/84] attempts to add metrics requests --- .vscode/settings.json | 44 ++++++++++++++++++++++++++++++ include/faabric/proto/faabric.pb.h | 4 ++- src/proto/faabric.proto | 7 +++++ src/util/json.cpp | 5 ++++ 4 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..5292e8a50 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,44 @@ +{ + "editor.tokenColorCustomizations": { + "[*Light*]": { + "textMateRules": [ + { + "scope": "ref.matchtext", + "settings": { + "foreground": "#000" + } + } + ] + }, + "[*Dark*]": { + "textMateRules": [ + { + "scope": "ref.matchtext", + "settings": { + "foreground": "#fff" + } + } + ] + }, + "textMateRules": [ + { + "scope": "googletest.failed", + "settings": { + "foreground": "#f00" + } + }, + { + "scope": "googletest.passed", + "settings": { + "foreground": "#0f0" + } + }, + { + "scope": "googletest.run", + "settings": { + "foreground": "#0f0" + } + } + ] + } +} \ No newline at end of file diff --git a/include/faabric/proto/faabric.pb.h b/include/faabric/proto/faabric.pb.h index e58144c6a..0a6dfa8f1 100644 --- a/include/faabric/proto/faabric.pb.h +++ b/include/faabric/proto/faabric.pb.h @@ -53,6 +53,7 @@ struct MessageRecord final std::string pythonFunction; std::string cmdline; bool forbidNdp = false; + bool isMetricsRequest = false; MessageRecord() = default; MessageRecord(const faabric::Message& msg) @@ -63,7 +64,8 @@ struct MessageRecord final , pythonUser(msg.pythonuser()) , pythonFunction(msg.pythonfunction()) , cmdline(msg.cmdline()) - , forbidNdp(msg.forbidndp()) + , forbidNdp(msg.forbidndp()), + , isMetricsRequest(msg.ismetricsrequest()) { } ~MessageRecord() = default; diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index adad18e42..d89a33b05 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -72,6 +72,12 @@ message FunctionStatusResponse { FunctionStatus status = 1; } +message NodeUtilisationResponse { + double cpu_utilisation = 1; + double mem_utilisation = 2; + double load_avg = 3; +} + // --------------------------------------------- // MPI // --------------------------------------------- @@ -177,6 +183,7 @@ message Message { optional bool forbidNdp = 95; repeated int32 extraArguments = 96; string ndpCallObjectName = 97; + optional bool isMetricsRequest = 98; } message DirectResultTransmission { diff --git a/src/util/json.cpp b/src/util/json.cpp index ccd4777ca..211995a3f 100644 --- a/src/util/json.cpp +++ b/src/util/json.cpp @@ -223,6 +223,10 @@ std::string messageToJson(const faabric::Message& msg) d.AddMember("forbid_ndp", Value(msg.forbidndp()), a); } + if (msg.isMetricsRequest()) { + d.AddMember("is_metrics_request", Value(msg.isMetricsRequest()), a); + } + StringBuffer sb; Writer writer(sb); d.Accept(writer); @@ -435,6 +439,7 @@ faabric::Message jsonToMessage(const std::string& jsonIn) getBoolFromJson(d, "is_output_memory_delta", false)); msg.set_directresulthost(getStringFromJson(d, "direct_result_host", "")); msg.set_forbidndp(getBoolFromJson(d, "forbid_ndp", false)); + msg.set_ismetricsrequest(getBoolFromJson(d, "isMetricsRequest", false)); PROF_END(jsonDecode) From 283038b609b11c49135b450b94979a6f27e03b59 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Tue, 13 Feb 2024 13:25:51 +0000 Subject: [PATCH 13/84] updated scheduler to shutdown on crashing --- src/redis/Redis.cpp | 2 +- src/scheduler/Scheduler.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/redis/Redis.cpp b/src/redis/Redis.cpp index a2780412a..db17c278c 100644 --- a/src/redis/Redis.cpp +++ b/src/redis/Redis.cpp @@ -678,7 +678,7 @@ UniqueRedisReply Redis::dequeueBase(const std::string& queueName, int timeoutMs) // Check if we got anything if (reply == nullptr || reply->type == REDIS_REPLY_NIL) { std::string msg = - fmt::format("No response from Redis dequeue in {}ms for queue {}", + fmt::format("d from Redis dequeue in {}ms for queue {}", timeoutMs, queueName); throw RedisNoResponseException(msg); diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 7902965b6..2afb041f7 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -103,6 +103,8 @@ Scheduler::Scheduler() Scheduler::~Scheduler() { + + shutdown(); if (!_isShutdown) { SPDLOG_ERROR("Destructing scheduler without shutting down first"); } From 609a47cc489ad1eae3583ce504e6d09f8dbe38e9 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Tue, 13 Feb 2024 13:29:03 +0000 Subject: [PATCH 14/84] added exception handling to faabric endpoint --- src/endpoint/FaabricEndpoint.cpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/endpoint/FaabricEndpoint.cpp b/src/endpoint/FaabricEndpoint.cpp index 1510dafaa..2ad2aa8e4 100644 --- a/src/endpoint/FaabricEndpoint.cpp +++ b/src/endpoint/FaabricEndpoint.cpp @@ -78,7 +78,27 @@ class HttpConnection : public std::enable_shared_from_this stream.get_executor(), std::bind_front(&HttpConnection::sendResponse, this->shared_from_this()) }; - handler->onRequest(std::move(hrc), std::move(msg)); + try { + handler->onRequest(std::move(hrc), std::move(msg)); + } catch (std::exception& e) { + SPDLOG_ERROR("Error handling HTTP request: {}", e.what()); + faabric::util::BeastHttpResponse response; + response.result(beast::http::status::internal_server_error); + response.body() = e.what(); + sendResponse(std::move(response)); + } catch (boost::system::system_error& e) { + SPDLOG_ERROR("Error handling HTTP request: {}", e.what()); + faabric::util::BeastHttpResponse response; + response.result(beast::http::status::internal_server_error); + response.body() = e.what(); + sendResponse(std::move(response)); + } catch(...) { + SPDLOG_ERROR("Error handling HTTP request: unknown exception"); + faabric::util::BeastHttpResponse response; + response.result(beast::http::status::internal_server_error); + response.body() = "Unknown error"; + sendResponse(std::move(response)); + } } void onRead(beast::error_code ec, size_t bytesTransferred) From ff7967394774b54be55c9578badc908a2417778a Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 00:09:41 +0000 Subject: [PATCH 15/84] added entry-point debug statements --- src/scheduler/Scheduler.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 7902965b6..31ee1cf39 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -307,6 +307,7 @@ int Scheduler::reapStaleExecutors() req.set_user(user); req.set_function(function); + SPDLOG_DEBUG("Unregistering {} from {}", key, thisHost); getFunctionCallClient(masterHost)->unregister(req); } @@ -407,6 +408,7 @@ faabric::util::SchedulingDecision Scheduler::callFunctions( SPDLOG_DEBUG("Forwarding {} back to master {}", funcStr, masterHost); ZoneScopedN("Scheduler::callFunctions forward to master"); + SPDLOG_DEBUG("Forwarding {} to master {}", funcStr, masterHost); getFunctionCallClient(masterHost)->executeFunctions(req); SchedulingDecision decision(firstMsg.appid(), firstMsg.groupid()); decision.returnHost = masterHost; @@ -989,6 +991,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( } // Dispatch the calls + SPDLOG_DEBUG("Dispatching {} to {}", funcStr, host); getFunctionCallClient(host)->executeFunctions(hostRequest); } } @@ -1207,6 +1210,7 @@ void Scheduler::broadcastFlush() allHosts.erase(thisHost); // Dispatch flush message to all other hosts + SPDLOG_DEBUG("Broadcasting flush to {} hosts", allHosts.size()); for (auto& otherHost : allHosts) { getFunctionCallClient(otherHost)->sendFlush(); } @@ -1263,6 +1267,7 @@ void Scheduler::setFunctionResult(std::unique_ptr msg) if (!directResultHost.empty()) { ZoneScopedN("Direct result send"); faabric::util::FullLock lock(mx); + SPDLOG_DEBUG("Sending direct result for {} to {}", msg->id(), directResultHost); auto fc = getFunctionCallClient(directResultHost); lock.unlock(); { @@ -1650,7 +1655,7 @@ void Scheduler::setThisHostResources(faabric::HostResources& res) faabric::HostResources Scheduler::getHostResources(const std::string& host) { - SPDLOG_TRACE("Requesting resources from {}", host); + SPDLOG_DEBUG("Requesting resources from {}", host); return getFunctionCallClient(host)->getResources(); } @@ -1831,6 +1836,7 @@ void Scheduler::broadcastPendingMigrations( registeredHosts.erase(thisHost); // Send pending migrations to all involved hosts + SPDLOG_DEUBG("Broadcasting pending migrations for app {}", msg.appid()); for (auto& otherHost : thisRegisteredHosts) { getFunctionCallClient(otherHost)->sendPendingMigrations( pendingMigrations); From 540abf847210b58eccef53304c5f8c1b9ea4abdc Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 00:11:27 +0000 Subject: [PATCH 16/84] Fix debug log statement in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 31ee1cf39..dabceab24 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1836,7 +1836,7 @@ void Scheduler::broadcastPendingMigrations( registeredHosts.erase(thisHost); // Send pending migrations to all involved hosts - SPDLOG_DEUBG("Broadcasting pending migrations for app {}", msg.appid()); + SPDLOG_DEBUG("Broadcasting pending migrations for app {}", msg.appid()); for (auto& otherHost : thisRegisteredHosts) { getFunctionCallClient(otherHost)->sendPendingMigrations( pendingMigrations); From 3ffd4d63be9f31b18cc2cb899d41180b21d9a162 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 00:22:24 +0000 Subject: [PATCH 17/84] Fix host scheduling logic and add debug logging --- src/scheduler/Scheduler.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index dabceab24..29708b5d2 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -530,7 +530,9 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // Make sure we don't execute the wrong kind (storage/compute) of // call locally - if (hostKindDifferent) { + if (hostKindDifferent) { + SPDLOG_DEBUG("Host kind different, not scheduling {} locally", + funcStr); nLocally = 0; } @@ -544,14 +546,18 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // If some are left, we need to distribute. // First try and do so on already registered hosts. int remainder = nMessages - nLocally; - if (!hostKindDifferent && remainder > 0) { + SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", + remainder, + nMessages, + funcStr); const std::set& thisRegisteredHosts = getFunctionRegisteredHosts( firstMsg.user(), firstMsg.function(), false); for (const auto& h : thisRegisteredHosts) { // Work out resources on the remote host + SPDLOG_DEBUG("Checking {} for resources", h); faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); @@ -607,6 +613,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( lastHost = h; // Work out resources on the remote host + SPDLOG_DEBUG("Checkig unregeistered {} for resources", h); faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); From 6dd73dd282f1b38ebd4242f6010323ca2e9bddb7 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 00:42:33 +0000 Subject: [PATCH 18/84] Add array file association for cpp files and handle exceptions in FaabricEndpointHandler::onRequest --- .vscode/settings.json | 5 +++++ src/endpoint/FaabricEndpointHandler.cpp | 18 ++++++++++++++++-- src/scheduler/Scheduler.cpp | 2 ++ 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..889b02bd5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "files.associations": { + "array": "cpp" + } +} \ No newline at end of file diff --git a/src/endpoint/FaabricEndpointHandler.cpp b/src/endpoint/FaabricEndpointHandler.cpp index fcc00d5b1..63ade2b08 100644 --- a/src/endpoint/FaabricEndpointHandler.cpp +++ b/src/endpoint/FaabricEndpointHandler.cpp @@ -89,8 +89,22 @@ void FaabricEndpointHandler::onRequest( response.result(beast::http::status::ok); response.body() = std::string("Flush sent"); } else { - executeFunction( - std::move(ctx), std::move(response), std::move(msg)); + try + { + executeFunction(std::move(ctx), std::move(response), std::move(msg)); + } + catch (const std::exception& e) + { + SPDLOG_ERROR("Caught exception in FaabricEndpointHandler::onRequest: {}", e.what()); + response.result(beast::http::status::internal_server_error); + response.body() = std::string("Caught exception: ") + e.what(); + ctx.sendFunction(std::move(response)); + } catch (faabric::util::FaabricException& e) { + SPDLOG_ERROR("Caught FaabricException in FaabricEndpointHandler::onRequest: {}", e.what()); + response.result(beast::http::status::internal_server_error); + response.body() = std::string("Caught exception: ") + e.what(); + ctx.sendFunction(std::move(response)); + } return; } } diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 29708b5d2..c5a2aedb8 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -614,6 +614,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( lastHost = h; // Work out resources on the remote host SPDLOG_DEBUG("Checkig unregeistered {} for resources", h); + SPDLOG_DEBUG("Remaining: {}", remainder); + faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); From f2f01849eae98603aa3f1223f9af265743b91e5d Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 01:07:41 +0000 Subject: [PATCH 19/84] Refactor FaabricEndpointHandler and Scheduler --- src/endpoint/FaabricEndpointHandler.cpp | 2 +- src/scheduler/Scheduler.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/endpoint/FaabricEndpointHandler.cpp b/src/endpoint/FaabricEndpointHandler.cpp index 63ade2b08..cc29eb0e5 100644 --- a/src/endpoint/FaabricEndpointHandler.cpp +++ b/src/endpoint/FaabricEndpointHandler.cpp @@ -189,7 +189,7 @@ void FaabricEndpointHandler::onFunctionResult( response.body() = result.outputdata(); SPDLOG_DEBUG("Worker thread {} sending response", gettid()); - ctx.sendFunction(std::move(response)); + return ctx.sendFunction(std::move(response)); SPDLOG_DEBUG("Worker thread {} response sent", gettid()); // We're done with this request } diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index c5a2aedb8..8ffb3ad03 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -618,20 +618,21 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); - // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than // its available slots. available = std::max(0, available); int nOnThisHost = std::min(available, remainder); + SPDLOG_DEBUG("Unregisted Host Available Slots: {}, nOnThisHost: {}", available, nOnThisHost); + if (topologyHint == faabric::util::SchedulingTopologyHint::NEVER_ALONE && nOnThisHost < 2) { continue; } - SPDLOG_TRACE("Scheduling {}/{} of {} on {} (unregistered)", + SPDLOG_DEBUG("Scheduling {}/{} of {} on {} (unregistered)", nOnThisHost, nMessages, funcStr, From 3f014dc0bfa5c329d86c64b5356f1b5460d45777 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 22:08:06 +0000 Subject: [PATCH 20/84] Remove isMetricsRequest field from MessageRecord and faabric.proto --- include/faabric/proto/faabric.pb.h | 4 +--- src/proto/faabric.proto | 1 - src/scheduler/Scheduler.cpp | 10 +++------- src/util/json.cpp | 5 ----- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/include/faabric/proto/faabric.pb.h b/include/faabric/proto/faabric.pb.h index 0a6dfa8f1..e58144c6a 100644 --- a/include/faabric/proto/faabric.pb.h +++ b/include/faabric/proto/faabric.pb.h @@ -53,7 +53,6 @@ struct MessageRecord final std::string pythonFunction; std::string cmdline; bool forbidNdp = false; - bool isMetricsRequest = false; MessageRecord() = default; MessageRecord(const faabric::Message& msg) @@ -64,8 +63,7 @@ struct MessageRecord final , pythonUser(msg.pythonuser()) , pythonFunction(msg.pythonfunction()) , cmdline(msg.cmdline()) - , forbidNdp(msg.forbidndp()), - , isMetricsRequest(msg.ismetricsrequest()) + , forbidNdp(msg.forbidndp()) { } ~MessageRecord() = default; diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index d89a33b05..f85af23ed 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -183,7 +183,6 @@ message Message { optional bool forbidNdp = 95; repeated int32 extraArguments = 96; string ndpCallObjectName = 97; - optional bool isMetricsRequest = 98; } message DirectResultTransmission { diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index f59f93d04..a603c92d9 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -549,13 +549,9 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // First try and do so on already registered hosts. int remainder = nMessages - nLocally; if (!hostKindDifferent && remainder > 0) { - SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", - remainder, - nMessages, - funcStr); - const std::set& thisRegisteredHosts = - getFunctionRegisteredHosts( - firstMsg.user(), firstMsg.function(), false); + SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); + + const std::set& thisRegisteredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); for (const auto& h : thisRegisteredHosts) { // Work out resources on the remote host diff --git a/src/util/json.cpp b/src/util/json.cpp index 211995a3f..ccd4777ca 100644 --- a/src/util/json.cpp +++ b/src/util/json.cpp @@ -223,10 +223,6 @@ std::string messageToJson(const faabric::Message& msg) d.AddMember("forbid_ndp", Value(msg.forbidndp()), a); } - if (msg.isMetricsRequest()) { - d.AddMember("is_metrics_request", Value(msg.isMetricsRequest()), a); - } - StringBuffer sb; Writer writer(sb); d.Accept(writer); @@ -439,7 +435,6 @@ faabric::Message jsonToMessage(const std::string& jsonIn) getBoolFromJson(d, "is_output_memory_delta", false)); msg.set_directresulthost(getStringFromJson(d, "direct_result_host", "")); msg.set_forbidndp(getBoolFromJson(d, "forbid_ndp", false)); - msg.set_ismetricsrequest(getBoolFromJson(d, "isMetricsRequest", false)); PROF_END(jsonDecode) From 02d7472bd70b848400aa108c0159c00b9c0699ff Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 23:21:50 +0000 Subject: [PATCH 21/84] Refactored the faasm round-robin implementation to a policy class --- .../faabric/loadbalance/LoadBalancePolicy.h | 12 +++--- src/loadbalance/FaasmDefaultPolicy.cpp | 22 +++++++++- src/scheduler/Scheduler.cpp | 43 +++---------------- 3 files changed, 34 insertions(+), 43 deletions(-) diff --git a/include/faabric/loadbalance/LoadBalancePolicy.h b/include/faabric/loadbalance/LoadBalancePolicy.h index 3e6826d40..74a9d3c86 100644 --- a/include/faabric/loadbalance/LoadBalancePolicy.h +++ b/include/faabric/loadbalance/LoadBalancePolicy.h @@ -1,28 +1,30 @@ #pragma once #include -#include +#include +#include +#include class LoadBalancePolicy { public: - virtual std::string dispatch(const std::set& warm_faaslets) = 0; + virtual std::vector dispatch(const std::set& warm_faaslets, int number_of_messages) = 0; }; class FaasmDefaultPolicy : public LoadBalancePolicy { public: - std::string dispatch(const std::set& warm_faaslets) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages) override; }; class LeastLoadAveragePolicy : public LoadBalancePolicy { public: - std::string dispatch(const std::set& warm_faaslets) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages) override; }; class MostSlotsPolicy : public LoadBalancePolicy { public: - std::string dispatch(const std::set& warm_faaslets) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages) override; }; \ No newline at end of file diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index 6e695d24c..c21f7c91d 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -1,7 +1,25 @@ #include #include -std::string FaasmDefaultPolicy::dispatch(const std::set& warm_faaslets) +std::vector FaasmDefaultPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages) { - throw std::runtime_error("FaasmDefaultPolicy::dispatch not implemented"); + std::vector hosts_delta; + hosts_delta.reserve(number_of_messages); + for (auto& faaslet : warm_faaslets) { + int remainder = number_of_messages; + faabric::HostResources host_resources = faabric::util::getHostResources(faaslet); + int availableSlots = host_resources.slots() - host_resources.usedslots(); + availableSlots = std::max(availableSlots, 0); + int nOnThisHost = std::min(availableSlots, 1); + + for (int i = 0; i < nOnThisHost; i++) { + hosts_delta.push_back(faaslet); + } + + remainder -= nOnThisHost; + if (remainder <= 0) { + break; + } + } + return hosts_delta; } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index a603c92d9..80778d45d 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -440,7 +440,7 @@ faabric::util::SchedulingDecision Scheduler::makeSchedulingDecision( } faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( - std::shared_ptr req, + std::shared_ptr req faabric::util::SchedulingTopologyHint topologyHint) { ZoneScopedNS("Scheduler::makeSchedulingDecision", 5); @@ -550,43 +550,14 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( int remainder = nMessages - nLocally; if (!hostKindDifferent && remainder > 0) { SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); - + + FaasmDefaultPolicy policy; const std::set& thisRegisteredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); + std::vector balanced_order = policy.dispatch(thisRegisteredHosts, remainder); - for (const auto& h : thisRegisteredHosts) { - // Work out resources on the remote host - SPDLOG_DEBUG("Checking {} for resources", h); - faabric::HostResources r = getHostResources(h); - int available = r.slots() - r.usedslots(); - - // We need to floor at zero here in case the remote host is - // overloaded, in which case its used slots will be greater than - // its available slots. - available = std::max(0, available); - int nOnThisHost = std::min(available, remainder); - - // Under the NEVER_ALONE topology hint, we never choose a host - // unless we can schedule at least two requests in it. - if (topologyHint == - faabric::util::SchedulingTopologyHint::NEVER_ALONE && - nOnThisHost < 2) { - continue; - } - - SPDLOG_TRACE("Scheduling {}/{} of {} on {} (registered)", - nOnThisHost, - nMessages, - funcStr, - h); - - for (int i = 0; i < nOnThisHost; i++) { - hosts.push_back(h); - } - - remainder -= nOnThisHost; - if (remainder <= 0) { - break; - } + remainder -= balanced_order.size(); + for (const auto& h : balanced_order) { + hosts.push_back(h); } } From f6cef20300028502cfb7ed8871c09dca4eab6a9e Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 23:26:10 +0000 Subject: [PATCH 22/84] fixes removed comma --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 80778d45d..3a986cdf3 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -440,7 +440,7 @@ faabric::util::SchedulingDecision Scheduler::makeSchedulingDecision( } faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( - std::shared_ptr req + std::shared_ptr req, faabric::util::SchedulingTopologyHint topologyHint) { ZoneScopedNS("Scheduler::makeSchedulingDecision", 5); From eaeb790b7b5c0aa7ec5c87b997da6dec4f5202ed Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 23:29:24 +0000 Subject: [PATCH 23/84] added include directo --- CMakeLists.txt | 2 ++ src/scheduler/Scheduler.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c9d98321..2c61d92b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ add_subdirectory(src/mpi) add_subdirectory(src/proto) add_subdirectory(src/redis) add_subdirectory(src/runner) +add_subdirectory(src/loadbalance) add_subdirectory(src/scheduler) add_subdirectory(src/snapshot) add_subdirectory(src/state) @@ -130,6 +131,7 @@ add_library(faabric $ $ $ + $ $ $ ) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 3a986cdf3..6fee1866b 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include From 21f87e10e464d06bf94ea17a24234dc054536010 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 23:33:07 +0000 Subject: [PATCH 24/84] Adds CMAKE to loadbalance --- src/loadbalance/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 src/loadbalance/CMakeLists.txt diff --git a/src/loadbalance/CMakeLists.txt b/src/loadbalance/CMakeLists.txt new file mode 100644 index 000000000..4d0392361 --- /dev/null +++ b/src/loadbalance/CMakeLists.txt @@ -0,0 +1,7 @@ +faabric_lib(loadbalance + FaasmDefaultPolicy.cpp + LeastLoadAveragePolicy.cpp + MostSlotsPolicy.cpp) + + +target_link_libraries(loadbalance PRIVATE faabric::scheduler) \ No newline at end of file From 0376aa40f0385e03fc33774227c21ed7a9d8b540 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 23:40:34 +0000 Subject: [PATCH 25/84] Update LoadBalancePolicy dispatch method signature --- include/faabric/loadbalance/LoadBalancePolicy.h | 8 ++++---- src/loadbalance/FaasmDefaultPolicy.cpp | 4 ++-- src/loadbalance/LeastLoadAveragePolicy.cpp | 2 +- src/loadbalance/MostSlotsPolicy.cpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/faabric/loadbalance/LoadBalancePolicy.h b/include/faabric/loadbalance/LoadBalancePolicy.h index 74a9d3c86..75da1aebb 100644 --- a/include/faabric/loadbalance/LoadBalancePolicy.h +++ b/include/faabric/loadbalance/LoadBalancePolicy.h @@ -8,23 +8,23 @@ class LoadBalancePolicy { public: - virtual std::vector dispatch(const std::set& warm_faaslets, int number_of_messages) = 0; + virtual std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) = 0; }; class FaasmDefaultPolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) override; }; class LeastLoadAveragePolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) override; }; class MostSlotsPolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) override; }; \ No newline at end of file diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index c21f7c91d..562aca51e 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -1,13 +1,13 @@ #include #include -std::vector FaasmDefaultPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages) +std::vector FaasmDefaultPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) { std::vector hosts_delta; hosts_delta.reserve(number_of_messages); for (auto& faaslet : warm_faaslets) { int remainder = number_of_messages; - faabric::HostResources host_resources = faabric::util::getHostResources(faaslet); + faabric::HostResources host_resources = scheduler.getResourcesForHost(faaslet); int availableSlots = host_resources.slots() - host_resources.usedslots(); availableSlots = std::max(availableSlots, 0); int nOnThisHost = std::min(availableSlots, 1); diff --git a/src/loadbalance/LeastLoadAveragePolicy.cpp b/src/loadbalance/LeastLoadAveragePolicy.cpp index b0bbb5cbe..38c87eee9 100644 --- a/src/loadbalance/LeastLoadAveragePolicy.cpp +++ b/src/loadbalance/LeastLoadAveragePolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::string LeastLoadAveragePolicy::dispatch(const std::set& warm_faaslets) +std::string LeastLoadAveragePolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) { throw std::runtime_error("LeastLoadAveragePolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index e8e193567..3b5f59447 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::string MostSlotsPolicy::dispatch(const std::set& warm_faaslets) +std::string MostSlotsPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) { throw std::runtime_error("MostSlotsPolicy::dispatch not implemented"); } \ No newline at end of file From a268b3e0eb9e002e7dffea47e99f5bb6516c6bec Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 23:42:43 +0000 Subject: [PATCH 26/84] Update load balance policies to return vector of strings instead of a single string --- src/loadbalance/LeastLoadAveragePolicy.cpp | 2 +- src/loadbalance/MostSlotsPolicy.cpp | 2 +- src/scheduler/Scheduler.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/loadbalance/LeastLoadAveragePolicy.cpp b/src/loadbalance/LeastLoadAveragePolicy.cpp index 38c87eee9..c1aa8ed39 100644 --- a/src/loadbalance/LeastLoadAveragePolicy.cpp +++ b/src/loadbalance/LeastLoadAveragePolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::string LeastLoadAveragePolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) +std::vector LeastLoadAveragePolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) { throw std::runtime_error("LeastLoadAveragePolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index 3b5f59447..f1814ef39 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::string MostSlotsPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) +std::vector MostSlotsPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) { throw std::runtime_error("MostSlotsPolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 6fee1866b..07f98780e 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -554,7 +554,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( FaasmDefaultPolicy policy; const std::set& thisRegisteredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); - std::vector balanced_order = policy.dispatch(thisRegisteredHosts, remainder); + std::vector balanced_order = policy.dispatch(thisRegisteredHosts, remainder, this); // Messy DI to get access to methods remainder -= balanced_order.size(); for (const auto& h : balanced_order) { From 1794bcf56e74c6be5f33cec34789fd25e637d3b0 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 23:45:34 +0000 Subject: [PATCH 27/84] Update LoadBalancePolicy dispatch method signatures --- include/faabric/loadbalance/LoadBalancePolicy.h | 8 ++++---- src/loadbalance/FaasmDefaultPolicy.cpp | 2 +- src/loadbalance/LeastLoadAveragePolicy.cpp | 2 +- src/loadbalance/MostSlotsPolicy.cpp | 2 +- src/scheduler/Scheduler.cpp | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/faabric/loadbalance/LoadBalancePolicy.h b/include/faabric/loadbalance/LoadBalancePolicy.h index 75da1aebb..ac618787a 100644 --- a/include/faabric/loadbalance/LoadBalancePolicy.h +++ b/include/faabric/loadbalance/LoadBalancePolicy.h @@ -8,23 +8,23 @@ class LoadBalancePolicy { public: - virtual std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) = 0; + virtual std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) = 0; }; class FaasmDefaultPolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) override; }; class LeastLoadAveragePolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) override; }; class MostSlotsPolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) override; + std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) override; }; \ No newline at end of file diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index 562aca51e..7663eb5dd 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::vector FaasmDefaultPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) +std::vector FaasmDefaultPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) { std::vector hosts_delta; hosts_delta.reserve(number_of_messages); diff --git a/src/loadbalance/LeastLoadAveragePolicy.cpp b/src/loadbalance/LeastLoadAveragePolicy.cpp index c1aa8ed39..ac43784e2 100644 --- a/src/loadbalance/LeastLoadAveragePolicy.cpp +++ b/src/loadbalance/LeastLoadAveragePolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::vector LeastLoadAveragePolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) +std::vector LeastLoadAveragePolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) { throw std::runtime_error("LeastLoadAveragePolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index f1814ef39..0ebc5f2a8 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::vector MostSlotsPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, faabric::scheduler::Scheduler& scheduler) +std::vector MostSlotsPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) { throw std::runtime_error("MostSlotsPolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 07f98780e..a250c3ef0 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -554,7 +554,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( FaasmDefaultPolicy policy; const std::set& thisRegisteredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); - std::vector balanced_order = policy.dispatch(thisRegisteredHosts, remainder, this); // Messy DI to get access to methods + std::vector balanced_order = policy.dispatch(thisRegisteredHosts, remainder, *this); // Messy DI to get access to methods remainder -= balanced_order.size(); for (const auto& h : balanced_order) { From 5c914d2da35c74d2c12679fd5b3cb286dad28123 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Wed, 14 Feb 2024 23:46:05 +0000 Subject: [PATCH 28/84] fixed wrong method call --- src/loadbalance/FaasmDefaultPolicy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index 7663eb5dd..68d191fac 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -7,7 +7,7 @@ std::vector FaasmDefaultPolicy::dispatch(const std::set(availableSlots, 1); From faf639d1744bee25269364a8475dc782cc8b8374 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 00:45:43 +0000 Subject: [PATCH 29/84] Refactor code for improved performance and readability --- .../faabric/loadbalance/LoadBalancePolicy.h | 8 ++-- src/loadbalance/FaasmDefaultPolicy.cpp | 22 +--------- src/loadbalance/LeastLoadAveragePolicy.cpp | 2 +- src/loadbalance/MostSlotsPolicy.cpp | 2 +- src/scheduler/Scheduler.cpp | 43 ++++++++++++++++--- 5 files changed, 46 insertions(+), 31 deletions(-) diff --git a/include/faabric/loadbalance/LoadBalancePolicy.h b/include/faabric/loadbalance/LoadBalancePolicy.h index ac618787a..b832a71ad 100644 --- a/include/faabric/loadbalance/LoadBalancePolicy.h +++ b/include/faabric/loadbalance/LoadBalancePolicy.h @@ -8,23 +8,23 @@ class LoadBalancePolicy { public: - virtual std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) = 0; + virtual std::set dispatch(std::set hosts, std::vector host_resources) = 0; }; class FaasmDefaultPolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) override; + std::set dispatch(std::set hosts, std::vector host_resources) override; }; class LeastLoadAveragePolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) override; + std::set dispatch(std::set hosts, std::vector host_resources) override; }; class MostSlotsPolicy : public LoadBalancePolicy { public: - std::vector dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) override; + std::set dispatch(std::set hosts, std::vector host_resources) override; }; \ No newline at end of file diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index 68d191fac..9f235ff72 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -1,25 +1,7 @@ #include #include -std::vector FaasmDefaultPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) +std::set FaasmDefaultPolicy::dispatch(std::set hosts, std::vector host_resources) { - std::vector hosts_delta; - hosts_delta.reserve(number_of_messages); - for (auto& faaslet : warm_faaslets) { - int remainder = number_of_messages; - faabric::HostResources host_resources = scheduler.getHostResources(faaslet); - int availableSlots = host_resources.slots() - host_resources.usedslots(); - availableSlots = std::max(availableSlots, 0); - int nOnThisHost = std::min(availableSlots, 1); - - for (int i = 0; i < nOnThisHost; i++) { - hosts_delta.push_back(faaslet); - } - - remainder -= nOnThisHost; - if (remainder <= 0) { - break; - } - } - return hosts_delta; + return hosts; // Simply return the hosts in the order they were given (round-robin) } \ No newline at end of file diff --git a/src/loadbalance/LeastLoadAveragePolicy.cpp b/src/loadbalance/LeastLoadAveragePolicy.cpp index ac43784e2..0df6db562 100644 --- a/src/loadbalance/LeastLoadAveragePolicy.cpp +++ b/src/loadbalance/LeastLoadAveragePolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::vector LeastLoadAveragePolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) +std::set LeastLoadAveragePolicy::dispatch(std::set hosts, std::vector host_resources) { throw std::runtime_error("LeastLoadAveragePolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index 0ebc5f2a8..eb18ff325 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::vector MostSlotsPolicy::dispatch(const std::set& warm_faaslets, int number_of_messages, const faabric::scheduler::Scheduler& scheduler) +std::set MostSlotsPolicy::dispatch(std::set hosts, std::vector host_resources) { throw std::runtime_error("MostSlotsPolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index a250c3ef0..0ba499238 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -552,13 +552,46 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( if (!hostKindDifferent && remainder > 0) { SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); - FaasmDefaultPolicy policy; + bool use_faasm_default_policy = true; const std::set& thisRegisteredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); - std::vector balanced_order = policy.dispatch(thisRegisteredHosts, remainder, *this); // Messy DI to get access to methods - remainder -= balanced_order.size(); - for (const auto& h : balanced_order) { - hosts.push_back(h); + std::vector hostResources; + for (std::string h : thisRegisteredHosts) { + hostResources.push_back(getHostResources(h)); + } + + // Reorder thisRegistered based on policy selected + FaasmDefaultPolicy policy; + std::set balancedRegisteredHosts = policy.dispatch(thisRegisteredHosts, hostResources); + + for (const auto& h : balancedRegisteredHosts) { + // Work out resources on the remote host + faabric::HostResources r = getHostResources(h); + int available = r.slots() - r.usedslots(); + // We need to floor at zero here in case the remote host is + // overloaded, in which case its used slots will be greater than + // its available slots. + available = std::max(0, available); + int nOnThisHost = std::min(available, remainder); + // Under the NEVER_ALONE topology hint, we never choose a host + // unless we can schedule at least two requests in it. + if (topologyHint == + faabric::util::SchedulingTopologyHint::NEVER_ALONE && + nOnThisHost < 2) { + continue; + } + SPDLOG_TRACE("Scheduling {}/{} of {} on {} (registered)", + nOnThisHost, + nMessages, + funcStr, + h); + for (int i = 0; i < nOnThisHost; i++) { + hosts.push_back(h); + } + remainder -= nOnThisHost; + if (remainder <= 0) { + break; + } } } From 1a827195870e76983d4b9460f84fc042376e90d6 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 00:55:19 +0000 Subject: [PATCH 30/84] Refactor load balancing and scheduling code --- src/loadbalance/FaasmDefaultPolicy.cpp | 2 +- src/scheduler/Scheduler.cpp | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index 9f235ff72..046c4fb9b 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::set FaasmDefaultPolicy::dispatch(std::set hosts, std::vector host_resources) +std::map FaasmDefaultPolicy::dispatch(std::map hosts) { return hosts; // Simply return the hosts in the order they were given (round-robin) } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 0ba499238..a6c6843e4 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -553,21 +553,17 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); bool use_faasm_default_policy = true; - const std::set& thisRegisteredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); - - std::vector hostResources; - for (std::string h : thisRegisteredHosts) { - hostResources.push_back(getHostResources(h)); + std::map hosts_map; + for (std::string h : getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false)) { + hosts_map[h] = getHostResources(h); } - // Reorder thisRegistered based on policy selected FaasmDefaultPolicy policy; - std::set balancedRegisteredHosts = policy.dispatch(thisRegisteredHosts, hostResources); + policy.dispatch(hosts_map); - for (const auto& h : balancedRegisteredHosts) { + for (const auto& [host, resources] : balancedRegisteredHosts) { // Work out resources on the remote host - faabric::HostResources r = getHostResources(h); - int available = r.slots() - r.usedslots(); + int available = resources.slots() - resources.usedslots(); // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than // its available slots. From cdb0e70e73663c4d875a0f45defc8747d1929b41 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 00:59:22 +0000 Subject: [PATCH 31/84] Refactor load balance policy dispatch methods to use std::map --- include/faabric/loadbalance/LoadBalancePolicy.h | 8 ++++---- src/loadbalance/FaasmDefaultPolicy.cpp | 4 ++-- src/loadbalance/LeastLoadAveragePolicy.cpp | 2 +- src/loadbalance/MostSlotsPolicy.cpp | 2 +- src/scheduler/Scheduler.cpp | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/faabric/loadbalance/LoadBalancePolicy.h b/include/faabric/loadbalance/LoadBalancePolicy.h index b832a71ad..fe53cde5e 100644 --- a/include/faabric/loadbalance/LoadBalancePolicy.h +++ b/include/faabric/loadbalance/LoadBalancePolicy.h @@ -8,23 +8,23 @@ class LoadBalancePolicy { public: - virtual std::set dispatch(std::set hosts, std::vector host_resources) = 0; + virtual std::map dispatch(std::map host_resources) = 0; }; class FaasmDefaultPolicy : public LoadBalancePolicy { public: - std::set dispatch(std::set hosts, std::vector host_resources) override; + std::map dispatch(std::map host_resources) override; }; class LeastLoadAveragePolicy : public LoadBalancePolicy { public: - std::set dispatch(std::set hosts, std::vector host_resources) override; + std::map dispatch(std::map host_resources) override; }; class MostSlotsPolicy : public LoadBalancePolicy { public: - std::set dispatch(std::set hosts, std::vector host_resources) override; + std::map dispatch(std::map host_resources) override; }; \ No newline at end of file diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index 046c4fb9b..e2720543e 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::map FaasmDefaultPolicy::dispatch(std::map hosts) +std::map FaasmDefaultPolicy::dispatch(std::map host_resources) { - return hosts; // Simply return the hosts in the order they were given (round-robin) + return host_resources; // Simply return the hosts in the order they were given (round-robin) } \ No newline at end of file diff --git a/src/loadbalance/LeastLoadAveragePolicy.cpp b/src/loadbalance/LeastLoadAveragePolicy.cpp index 0df6db562..79b343f5b 100644 --- a/src/loadbalance/LeastLoadAveragePolicy.cpp +++ b/src/loadbalance/LeastLoadAveragePolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::set LeastLoadAveragePolicy::dispatch(std::set hosts, std::vector host_resources) +std::map LeastLoadAveragePolicy::dispatch(std::map host_resources) { throw std::runtime_error("LeastLoadAveragePolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index eb18ff325..59b61fe7f 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::set MostSlotsPolicy::dispatch(std::set hosts, std::vector host_resources) +std::map MostSlotsPolicy::dispatch(std::map host_resources) { throw std::runtime_error("MostSlotsPolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index a6c6843e4..6bb63b186 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -582,7 +582,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( funcStr, h); for (int i = 0; i < nOnThisHost; i++) { - hosts.push_back(h); + hosts.push_back(host); } remainder -= nOnThisHost; if (remainder <= 0) { From efbec43b1879c007c5d5d7984b7569989dcfd2f5 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 01:01:19 +0000 Subject: [PATCH 32/84] fix --- src/scheduler/Scheduler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 6bb63b186..6ccb1e55b 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -561,7 +561,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( FaasmDefaultPolicy policy; policy.dispatch(hosts_map); - for (const auto& [host, resources] : balancedRegisteredHosts) { + for (const auto& [host, resources] : hosts_map) { // Work out resources on the remote host int available = resources.slots() - resources.usedslots(); // We need to floor at zero here in case the remote host is @@ -580,7 +580,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( nOnThisHost, nMessages, funcStr, - h); + host); for (int i = 0; i < nOnThisHost; i++) { hosts.push_back(host); } From 343c59f2dc9727ff7a75973508fdaaaa34862759 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 01:40:11 +0000 Subject: [PATCH 33/84] Update LoadBalancePolicy dispatch function signatures --- include/faabric/loadbalance/LoadBalancePolicy.h | 8 ++++---- src/loadbalance/FaasmDefaultPolicy.cpp | 2 +- src/loadbalance/LeastLoadAveragePolicy.cpp | 2 +- src/loadbalance/MostSlotsPolicy.cpp | 9 ++++++++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/faabric/loadbalance/LoadBalancePolicy.h b/include/faabric/loadbalance/LoadBalancePolicy.h index fe53cde5e..7c888e1ff 100644 --- a/include/faabric/loadbalance/LoadBalancePolicy.h +++ b/include/faabric/loadbalance/LoadBalancePolicy.h @@ -8,23 +8,23 @@ class LoadBalancePolicy { public: - virtual std::map dispatch(std::map host_resources) = 0; + virtual std::map dispatch(std::map& host_resources) = 0; }; class FaasmDefaultPolicy : public LoadBalancePolicy { public: - std::map dispatch(std::map host_resources) override; + std::map dispatch(std::map& host_resources) override; }; class LeastLoadAveragePolicy : public LoadBalancePolicy { public: - std::map dispatch(std::map host_resources) override; + std::map dispatch(std::map& host_resources) override; }; class MostSlotsPolicy : public LoadBalancePolicy { public: - std::map dispatch(std::map host_resources) override; + std::map dispatch(std::map& host_resources) override; }; \ No newline at end of file diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index e2720543e..be3118374 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::map FaasmDefaultPolicy::dispatch(std::map host_resources) +std::map FaasmDefaultPolicy::dispatch(std::map& host_resources) { return host_resources; // Simply return the hosts in the order they were given (round-robin) } \ No newline at end of file diff --git a/src/loadbalance/LeastLoadAveragePolicy.cpp b/src/loadbalance/LeastLoadAveragePolicy.cpp index 79b343f5b..10ec4599a 100644 --- a/src/loadbalance/LeastLoadAveragePolicy.cpp +++ b/src/loadbalance/LeastLoadAveragePolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::map LeastLoadAveragePolicy::dispatch(std::map host_resources) +std::map LeastLoadAveragePolicy::dispatch(std::map& host_resources) { throw std::runtime_error("LeastLoadAveragePolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index 59b61fe7f..3c2246d97 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -1,7 +1,14 @@ #include #include -std::map MostSlotsPolicy::dispatch(std::map host_resources) +std::map MostSlotsPolicy::dispatch(std::map& host_resources) { throw std::runtime_error("MostSlotsPolicy::dispatch not implemented"); + + // Sort map in-place by available slots descending + std::sort(host_resources.begin(), host_resources.end(), [](const auto &a, const auto &b) { + int available_a = a.second.slots() - a.second.usedslots(); + int available_b = b.second.slots() - b.second.usedslots(); + return available_a > available_b; + }); } \ No newline at end of file From e5a364572851690ad70cdbb1713c464abb4a7850 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 01:40:55 +0000 Subject: [PATCH 34/84] Update scheduler policy to MostSlotsPolicy --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 6ccb1e55b..b21650069 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -558,7 +558,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( hosts_map[h] = getHostResources(h); } - FaasmDefaultPolicy policy; + MostSlotsPolicy policy; policy.dispatch(hosts_map); for (const auto& [host, resources] : hosts_map) { From 5ce849a8e3352f34d2a30310994a4afc235029a3 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 01:46:25 +0000 Subject: [PATCH 35/84] lol --- src/loadbalance/MostSlotsPolicy.cpp | 15 ++++++++++++--- src/scheduler/Scheduler.cpp | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index 3c2246d97..56ce814c4 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -3,12 +3,21 @@ std::map MostSlotsPolicy::dispatch(std::map& host_resources) { - throw std::runtime_error("MostSlotsPolicy::dispatch not implemented"); + std::vector> vec(host_resources.begin(), host_resources.end()); - // Sort map in-place by available slots descending - std::sort(host_resources.begin(), host_resources.end(), [](const auto &a, const auto &b) { + // Sort the vector by the number of available slots in descending order + std::sort(vec.begin(), vec.end(), [](const auto &a, const auto &b) { int available_a = a.second.slots() - a.second.usedslots(); int available_b = b.second.slots() - b.second.usedslots(); return available_a > available_b; }); + + // Convert the vector back to a map + std::map sorted_hosts; + for (const auto &pair : vec) + { + sorted_hosts[pair.first] = pair.second; + } + + return sorted_hosts; } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index b21650069..5fed101bf 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -559,7 +559,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( } MostSlotsPolicy policy; - policy.dispatch(hosts_map); + hosts_map = policy.dispatch(hosts_map); for (const auto& [host, resources] : hosts_map) { // Work out resources on the remote host From cae87bc75997672e052344a4ab28d13a229271f0 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 01:57:32 +0000 Subject: [PATCH 36/84] tings working --- src/scheduler/Scheduler.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 5fed101bf..6ab6fdefb 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -551,14 +551,14 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( int remainder = nMessages - nLocally; if (!hostKindDifferent && remainder > 0) { SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); - - bool use_faasm_default_policy = true; + std::map hosts_map; for (std::string h : getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false)) { hosts_map[h] = getHostResources(h); } - MostSlotsPolicy policy; + // MostSlotsPolicy policy; + FaasmDefaultPolicy policy; hosts_map = policy.dispatch(hosts_map); for (const auto& [host, resources] : hosts_map) { From 17e135426bab9331c41634b858795958acb845b5 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 14:52:34 +0000 Subject: [PATCH 37/84] Refactor scheduling policy in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 6ab6fdefb..2b9d760f1 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -444,6 +444,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::shared_ptr req, faabric::util::SchedulingTopologyHint topologyHint) { + FaasmDefaultPolicy policy; ZoneScopedNS("Scheduler::makeSchedulingDecision", 5); int nMessages = req->messages_size(); const faabric::Message& firstMsg = req->messages().at(0); @@ -558,7 +559,6 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( } // MostSlotsPolicy policy; - FaasmDefaultPolicy policy; hosts_map = policy.dispatch(hosts_map); for (const auto& [host, resources] : hosts_map) { @@ -595,16 +595,23 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::string lastHost; if (remainder > 0) { std::vector unregisteredHosts; + std::map hosts_map; + if (hostKindDifferent) { for (auto&& h : getAvailableHostsForFunction(firstMsg)) { - unregisteredHosts.push_back(std::move(h)); + hosts_map[h] = getHostResources(h); } } else { unregisteredHosts = getUnregisteredHosts(firstMsg.user(), firstMsg.function()); - } - for (const auto& h : unregisteredHosts) { + for (auto&& h : unregisteredHosts) { + hosts_map[h] = getHostResources(h); + } + } + + hosts_map = policy.dispatch(hosts_map); + for (const auto& [h, r] : hosts_map) { // Skip if this host if (h == thisHost) { continue; @@ -614,8 +621,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // Work out resources on the remote host SPDLOG_DEBUG("Checkig unregeistered {} for resources", h); SPDLOG_DEBUG("Remaining: {}", remainder); - - faabric::HostResources r = getHostResources(h); + int available = r.slots() - r.usedslots(); // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than From 033b89e152d2ca3b3c1a66e55b64e9f8f200b2aa Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 15:15:04 +0000 Subject: [PATCH 38/84] changes to MostSlots policy --- src/scheduler/Scheduler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 2b9d760f1..c31482b79 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -444,7 +444,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::shared_ptr req, faabric::util::SchedulingTopologyHint topologyHint) { - FaasmDefaultPolicy policy; + MostSlotsPolicy policy; ZoneScopedNS("Scheduler::makeSchedulingDecision", 5); int nMessages = req->messages_size(); const faabric::Message& firstMsg = req->messages().at(0); @@ -621,7 +621,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // Work out resources on the remote host SPDLOG_DEBUG("Checkig unregeistered {} for resources", h); SPDLOG_DEBUG("Remaining: {}", remainder); - + int available = r.slots() - r.usedslots(); // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than From 6cab51eaab04218e1681c19cee020688567aacf3 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 15:22:08 +0000 Subject: [PATCH 39/84] changes to MostSlots --- src/scheduler/Scheduler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index c31482b79..6ff42b090 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -562,6 +562,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( hosts_map = policy.dispatch(hosts_map); for (const auto& [host, resources] : hosts_map) { + print("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); // Work out resources on the remote host int available = resources.slots() - resources.usedslots(); // We need to floor at zero here in case the remote host is From c2f5c6a0ca98611623294fb7fde2616f06c71765 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 15:24:06 +0000 Subject: [PATCH 40/84] removed print to SPDLOG_INFO2 --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 6ff42b090..037d6a09d 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -562,7 +562,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( hosts_map = policy.dispatch(hosts_map); for (const auto& [host, resources] : hosts_map) { - print("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); + SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); // Work out resources on the remote host int available = resources.slots() - resources.usedslots(); // We need to floor at zero here in case the remote host is From 6cb947a7d05674f8ba63fdf685e54cc44da457ec Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 15:37:49 +0000 Subject: [PATCH 41/84] Reorder registered hosts based on LoadBalancePolicy in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 037d6a09d..7ab2eaa4b 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -559,6 +559,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( } // MostSlotsPolicy policy; + SPDLOG_DEBUG("Reordering registered hosts based on LoadBalancePolicy"); hosts_map = policy.dispatch(hosts_map); for (const auto& [host, resources] : hosts_map) { From 5f4461390b20fa2552a70487291de956976bafc7 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 15:52:05 +0000 Subject: [PATCH 42/84] Refactor load balancing and scheduling --- src/loadbalance/MostSlotsPolicy.cpp | 3 ++- src/scheduler/Scheduler.cpp | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index 56ce814c4..206cde91c 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -4,7 +4,7 @@ std::map MostSlotsPolicy::dispatch(std::map& host_resources) { std::vector> vec(host_resources.begin(), host_resources.end()); - + // Sort the vector by the number of available slots in descending order std::sort(vec.begin(), vec.end(), [](const auto &a, const auto &b) { int available_a = a.second.slots() - a.second.usedslots(); @@ -12,6 +12,7 @@ std::map MostSlotsPolicy::dispatch(std::map return available_a > available_b; }); + // Convert the vector back to a map std::map sorted_hosts; for (const auto &pair : vec) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 7ab2eaa4b..3308a9b62 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -560,9 +560,10 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // MostSlotsPolicy policy; SPDLOG_DEBUG("Reordering registered hosts based on LoadBalancePolicy"); - hosts_map = policy.dispatch(hosts_map); + std::map sorted_map = policy.dispatch(hosts_map); + SPDLOG_DEBUG("Reordered registered hosts based on LoadBalancePolicy"); - for (const auto& [host, resources] : hosts_map) { + for (const auto& [host, resources] : sorted_map) { SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); // Work out resources on the remote host int available = resources.slots() - resources.usedslots(); @@ -612,8 +613,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( } } - hosts_map = policy.dispatch(hosts_map); - for (const auto& [h, r] : hosts_map) { + std::map sorted_map = policy.dispatch(hosts_map); + for (const auto& [h, r] : sorted_map) { // Skip if this host if (h == thisHost) { continue; From 38399841b94ed0f2e7f75a4ea1b4e2834755ab10 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 16:00:26 +0000 Subject: [PATCH 43/84] Add print statement to display the size of the registered hosts map --- src/scheduler/Scheduler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 3308a9b62..dcd3c2abb 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -562,7 +562,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Reordering registered hosts based on LoadBalancePolicy"); std::map sorted_map = policy.dispatch(hosts_map); SPDLOG_DEBUG("Reordered registered hosts based on LoadBalancePolicy"); - + print("Registered Hosts map size: {}", sorted_map.size()); + for (const auto& [host, resources] : sorted_map) { SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); // Work out resources on the remote host From 2ab51babb22e1ceef2132c346dc8923c2a3a86bc Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 16:11:05 +0000 Subject: [PATCH 44/84] Refactor load balancing policies to use vectors instead of maps --- include/faabric/loadbalance/LoadBalancePolicy.h | 8 ++++---- src/loadbalance/FaasmDefaultPolicy.cpp | 4 ++-- src/loadbalance/LeastLoadAveragePolicy.cpp | 2 +- src/loadbalance/MostSlotsPolicy.cpp | 16 +++------------- src/scheduler/Scheduler.cpp | 6 +++--- 5 files changed, 13 insertions(+), 23 deletions(-) diff --git a/include/faabric/loadbalance/LoadBalancePolicy.h b/include/faabric/loadbalance/LoadBalancePolicy.h index 7c888e1ff..47f482e85 100644 --- a/include/faabric/loadbalance/LoadBalancePolicy.h +++ b/include/faabric/loadbalance/LoadBalancePolicy.h @@ -8,23 +8,23 @@ class LoadBalancePolicy { public: - virtual std::map dispatch(std::map& host_resources) = 0; + virtual std::vector> dispatch(std::vector>& host_resources) = 0; }; class FaasmDefaultPolicy : public LoadBalancePolicy { public: - std::map dispatch(std::map& host_resources) override; + std::vector> dispatch(std::vector>& host_resources) override; }; class LeastLoadAveragePolicy : public LoadBalancePolicy { public: - std::map dispatch(std::map& host_resources) override; + std::vector> dispatch(std::vector>& host_resources) override; }; class MostSlotsPolicy : public LoadBalancePolicy { public: - std::map dispatch(std::map& host_resources) override; + std::vector> dispatch(std::vector>& host_resources) override; }; \ No newline at end of file diff --git a/src/loadbalance/FaasmDefaultPolicy.cpp b/src/loadbalance/FaasmDefaultPolicy.cpp index be3118374..0853325e3 100644 --- a/src/loadbalance/FaasmDefaultPolicy.cpp +++ b/src/loadbalance/FaasmDefaultPolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::map FaasmDefaultPolicy::dispatch(std::map& host_resources) +std::vector> FaasmDefaultPolicy::dispatch(std::vector>& host_resources) { - return host_resources; // Simply return the hosts in the order they were given (round-robin) + return host_resources; } \ No newline at end of file diff --git a/src/loadbalance/LeastLoadAveragePolicy.cpp b/src/loadbalance/LeastLoadAveragePolicy.cpp index 10ec4599a..d948c99a4 100644 --- a/src/loadbalance/LeastLoadAveragePolicy.cpp +++ b/src/loadbalance/LeastLoadAveragePolicy.cpp @@ -1,7 +1,7 @@ #include #include -std::map LeastLoadAveragePolicy::dispatch(std::map& host_resources) +std::vector> LeastLoadAveragePolicy::dispatch(std::vector>& host_resources) { throw std::runtime_error("LeastLoadAveragePolicy::dispatch not implemented"); } \ No newline at end of file diff --git a/src/loadbalance/MostSlotsPolicy.cpp b/src/loadbalance/MostSlotsPolicy.cpp index 206cde91c..e4d38e8ef 100644 --- a/src/loadbalance/MostSlotsPolicy.cpp +++ b/src/loadbalance/MostSlotsPolicy.cpp @@ -1,24 +1,14 @@ #include #include -std::map MostSlotsPolicy::dispatch(std::map& host_resources) +std::vector> MostSlotsPolicy::dispatch(std::vector>& host_resources) { - std::vector> vec(host_resources.begin(), host_resources.end()); - // Sort the vector by the number of available slots in descending order - std::sort(vec.begin(), vec.end(), [](const auto &a, const auto &b) { + std::sort(host_resources.begin(), host_resources.end(), [](const auto &a, const auto &b) { int available_a = a.second.slots() - a.second.usedslots(); int available_b = b.second.slots() - b.second.usedslots(); return available_a > available_b; }); - - // Convert the vector back to a map - std::map sorted_hosts; - for (const auto &pair : vec) - { - sorted_hosts[pair.first] = pair.second; - } - - return sorted_hosts; + return host_resources; } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index dcd3c2abb..50627d615 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -553,9 +553,9 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( if (!hostKindDifferent && remainder > 0) { SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); - std::map hosts_map; + std::vector> host_resources_pairs; for (std::string h : getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false)) { - hosts_map[h] = getHostResources(h); + host_resources_pairs.push_back({h, getHostResources(h)}); } // MostSlotsPolicy policy; @@ -563,7 +563,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::map sorted_map = policy.dispatch(hosts_map); SPDLOG_DEBUG("Reordered registered hosts based on LoadBalancePolicy"); print("Registered Hosts map size: {}", sorted_map.size()); - + for (const auto& [host, resources] : sorted_map) { SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); // Work out resources on the remote host From cca1fe329f6f7bedc82561a564f8781e52d03e40 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 16:13:33 +0000 Subject: [PATCH 45/84] Refactor host resources handling in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 50627d615..7c31e86bd 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -599,23 +599,23 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::string lastHost; if (remainder > 0) { std::vector unregisteredHosts; - std::map hosts_map; + std::vector> host_resources_pairs; if (hostKindDifferent) { for (auto&& h : getAvailableHostsForFunction(firstMsg)) { - hosts_map[h] = getHostResources(h); + host_resources_pairs.push_back({h, getHostResources(h)}); } } else { unregisteredHosts = getUnregisteredHosts(firstMsg.user(), firstMsg.function()); for (auto&& h : unregisteredHosts) { - hosts_map[h] = getHostResources(h); + host_resources_pairs.push_back({h, getHostResources(h)}); } } - std::map sorted_map = policy.dispatch(hosts_map); - for (const auto& [h, r] : sorted_map) { + policy.dispatch(host_resources_pairs); + for (const auto& [h, r] : host_resources_pairs) { // Skip if this host if (h == thisHost) { continue; From 967ab3b6cbd048b1cb1b75096add4f9d49c6bd63 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 16:15:19 +0000 Subject: [PATCH 46/84] Refactor host sorting in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 7c31e86bd..3c219b2e0 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -560,9 +560,9 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // MostSlotsPolicy policy; SPDLOG_DEBUG("Reordering registered hosts based on LoadBalancePolicy"); - std::map sorted_map = policy.dispatch(hosts_map); + policy.dispatch(host_resources_pairs); SPDLOG_DEBUG("Reordered registered hosts based on LoadBalancePolicy"); - print("Registered Hosts map size: {}", sorted_map.size()); + print("Registered Hosts map size: {}", host_resources_pairs.size()); for (const auto& [host, resources] : sorted_map) { SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); From 2dbc189deb39a2755012aaf64775b2818a624d90 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 16:39:54 +0000 Subject: [PATCH 47/84] Refactor host iteration in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 3c219b2e0..a2c52c38c 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -564,7 +564,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Reordered registered hosts based on LoadBalancePolicy"); print("Registered Hosts map size: {}", host_resources_pairs.size()); - for (const auto& [host, resources] : sorted_map) { + for (const auto& [host, resources] : host_resources_pairs) { SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); // Work out resources on the remote host int available = resources.slots() - resources.usedslots(); From ea16c5f5a41ebb3fd4ec452a3557abf36c2d41de Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 16:41:14 +0000 Subject: [PATCH 48/84] removed print to SPDLOG --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index a2c52c38c..ecae1721a 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -562,7 +562,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Reordering registered hosts based on LoadBalancePolicy"); policy.dispatch(host_resources_pairs); SPDLOG_DEBUG("Reordered registered hosts based on LoadBalancePolicy"); - print("Registered Hosts map size: {}", host_resources_pairs.size()); + SPDLOG_DEBUG("Registered Hosts map size: {}", host_resources_pairs.size()); for (const auto& [host, resources] : host_resources_pairs) { SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); From c7261a136edc73b8b9d01f7c8baa92431ddca8a9 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 16:57:43 +0000 Subject: [PATCH 49/84] Refactor host registration in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index ecae1721a..7d2c7c9ee 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -552,10 +552,11 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( int remainder = nMessages - nLocally; if (!hostKindDifferent && remainder > 0) { SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); - + + const auto& registeredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); std::vector> host_resources_pairs; - for (std::string h : getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false)) { - host_resources_pairs.push_back({h, getHostResources(h)}); + for (std::string h : registeredHosts) { + host_resources_pairs.push_back(std::make_pair(h, getHostResources(h))); } // MostSlotsPolicy policy; @@ -614,7 +615,11 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( } } + SPDLOG_DEBUG("Reordering unregistered hosts based on LoadBalancePolicy"); policy.dispatch(host_resources_pairs); + SPDLOG_DEBUG("Reordered unregistered hosts based on LoadBalancePolicy"); + SPDLOG_DEBUG("Unregistered Hosts map size: {}", host_resources_pairs.size()); + for (const auto& [h, r] : host_resources_pairs) { // Skip if this host if (h == thisHost) { From 880918c152e612d07ab960cad9883f1d0c18f5b0 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 18:01:04 +0000 Subject: [PATCH 50/84] Update scheduler policy to FaasmDefaultPolicy --- src/scheduler/Scheduler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 7d2c7c9ee..0d133ef92 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -444,7 +444,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::shared_ptr req, faabric::util::SchedulingTopologyHint topologyHint) { - MostSlotsPolicy policy; + // MostSlotsPolicy policy; + FaasmDefaultPolicy policy; ZoneScopedNS("Scheduler::makeSchedulingDecision", 5); int nMessages = req->messages_size(); const faabric::Message& firstMsg = req->messages().at(0); From 2404a74b4dc1c3e33da489e38fa20abe0408f296 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 18:27:13 +0000 Subject: [PATCH 51/84] Remove unnecessary shutdown call and add debug logging for registered hosts --- src/scheduler/Scheduler.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 0d133ef92..16bd3ee1f 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -104,8 +104,6 @@ Scheduler::Scheduler() Scheduler::~Scheduler() { - - shutdown(); if (!_isShutdown) { SPDLOG_ERROR("Destructing scheduler without shutting down first"); } @@ -554,7 +552,9 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( if (!hostKindDifferent && remainder > 0) { SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); - const auto& registeredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); + const std::set& registeredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); + SPDLOG_DEBUG("Number of registered hosts: {}", registeredHosts.size()); + std::vector> host_resources_pairs; for (std::string h : registeredHosts) { host_resources_pairs.push_back(std::make_pair(h, getHostResources(h))); From b7cd62b6243e109382ab6560cfd2f858ff105321 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 18:36:16 +0000 Subject: [PATCH 52/84] Fix resource calculation in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 16bd3ee1f..5c2afe29c 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -568,8 +568,10 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( for (const auto& [host, resources] : host_resources_pairs) { SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); + // Work out resources on the remote host - int available = resources.slots() - resources.usedslots(); + faabric::HostResources& r = getHostResources(h); + int available = r.slots() - r.usedslots(); // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than // its available slots. @@ -621,7 +623,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Reordered unregistered hosts based on LoadBalancePolicy"); SPDLOG_DEBUG("Unregistered Hosts map size: {}", host_resources_pairs.size()); - for (const auto& [h, r] : host_resources_pairs) { + for (const auto& [h, resource] : host_resources_pairs) { // Skip if this host if (h == thisHost) { continue; @@ -632,6 +634,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Checkig unregeistered {} for resources", h); SPDLOG_DEBUG("Remaining: {}", remainder); + faabric::HostResources& r = getHostResources(h); // Get up to date info + int available = r.slots() - r.usedslots(); // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than From 0cfd96e1961ef428303ddcdf6c34a080eac28b11 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 18:38:05 +0000 Subject: [PATCH 53/84] test --- src/scheduler/Scheduler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 5c2afe29c..c51fe6045 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -570,7 +570,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); // Work out resources on the remote host - faabric::HostResources& r = getHostResources(h); + const faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than @@ -634,7 +634,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Checkig unregeistered {} for resources", h); SPDLOG_DEBUG("Remaining: {}", remainder); - faabric::HostResources& r = getHostResources(h); // Get up to date info + const faabric::HostResources r = getHostResources(h); // Get up to date info int available = r.slots() - r.usedslots(); // We need to floor at zero here in case the remote host is From eff1f10724568d3445db091825246df240fc7214 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 18:39:25 +0000 Subject: [PATCH 54/84] Fix incorrect variable name in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index c51fe6045..f500ce89f 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -570,7 +570,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); // Work out resources on the remote host - const faabric::HostResources r = getHostResources(h); + const faabric::HostResources r = getHostResources(host); int available = r.slots() - r.usedslots(); // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than From 3e4c246b27ef5487482cdf5d23b3eeb735ba30f2 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 20:41:43 +0000 Subject: [PATCH 55/84] Refactor host registration and scheduling --- src/scheduler/Scheduler.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index f500ce89f..90216cc54 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -566,8 +566,14 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Reordered registered hosts based on LoadBalancePolicy"); SPDLOG_DEBUG("Registered Hosts map size: {}", host_resources_pairs.size()); + // Extract a set of registered hosts from host_resources_pairs preserving the order + std::set registeredHostsSet; for (const auto& [host, resources] : host_resources_pairs) { - SPDLOG_INFO("Host: {}, Slots: {}, UsedSlots: {}", host, resources.slots(), resources.usedslots()); + registeredHostsSet.insert(host); + } + + for (const auto& host : registeredHostsSet) { + SPDLOG_INFO("Host: {}", host); // Work out resources on the remote host const faabric::HostResources r = getHostResources(host); @@ -610,8 +616,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( host_resources_pairs.push_back({h, getHostResources(h)}); } } else { - unregisteredHosts = - getUnregisteredHosts(firstMsg.user(), firstMsg.function()); + unregisteredHosts = getUnregisteredHosts(firstMsg.user(), firstMsg.function()); for (auto&& h : unregisteredHosts) { host_resources_pairs.push_back({h, getHostResources(h)}); @@ -623,7 +628,13 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( SPDLOG_DEBUG("Reordered unregistered hosts based on LoadBalancePolicy"); SPDLOG_DEBUG("Unregistered Hosts map size: {}", host_resources_pairs.size()); - for (const auto& [h, resource] : host_resources_pairs) { + // Extract a set of unregistered hosts from host_resources_pairs preserving the order + std::set unregisteredHostsSet; + for (const auto& [host, resources] : host_resources_pairs) { + unregisteredHostsSet.insert(host); + } + + for (const auto& h : unregisteredHostsSet) { // Skip if this host if (h == thisHost) { continue; From 7920f413aa579e5b832ad4adaea66fb5374460e4 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 20:49:35 +0000 Subject: [PATCH 56/84] Fix bug in login functionality*** ***Add validation for user input*** ***Refactor code for better readability*** ***Update error handling in API calls*** ***Optimize database queries for improved performance --- src/scheduler/Scheduler.cpp | 84 ++++++++++--------------------------- 1 file changed, 22 insertions(+), 62 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 90216cc54..db3fe98fb 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -514,10 +514,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( hosts.push_back(thisHost); } } else { - // At this point we know we're the master host, and we've not been - // asked to force full local execution. - - // Work out how many we can handle locally +// Work out how many we can handle locally int slots = thisHostResources.slots(); if (topologyHint == faabric::util::SchedulingTopologyHint::UNDERFULL) { slots = slots / 2; @@ -533,9 +530,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // Make sure we don't execute the wrong kind (storage/compute) of // call locally - if (hostKindDifferent) { - SPDLOG_DEBUG("Host kind different, not scheduling {} locally", - funcStr); + if (hostKindDifferent) { nLocally = 0; } @@ -549,40 +544,23 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // If some are left, we need to distribute. // First try and do so on already registered hosts. int remainder = nMessages - nLocally; - if (!hostKindDifferent && remainder > 0) { - SPDLOG_DEBUG("Scheduling {}/{} of {} on registered hosts", remainder, nMessages, funcStr); - - const std::set& registeredHosts = getFunctionRegisteredHosts(firstMsg.user(), firstMsg.function(), false); - SPDLOG_DEBUG("Number of registered hosts: {}", registeredHosts.size()); - - std::vector> host_resources_pairs; - for (std::string h : registeredHosts) { - host_resources_pairs.push_back(std::make_pair(h, getHostResources(h))); - } - - // MostSlotsPolicy policy; - SPDLOG_DEBUG("Reordering registered hosts based on LoadBalancePolicy"); - policy.dispatch(host_resources_pairs); - SPDLOG_DEBUG("Reordered registered hosts based on LoadBalancePolicy"); - SPDLOG_DEBUG("Registered Hosts map size: {}", host_resources_pairs.size()); - // Extract a set of registered hosts from host_resources_pairs preserving the order - std::set registeredHostsSet; - for (const auto& [host, resources] : host_resources_pairs) { - registeredHostsSet.insert(host); - } + if (!hostKindDifferent && remainder > 0) { + const std::set& thisRegisteredHosts = + getFunctionRegisteredHosts( + firstMsg.user(), firstMsg.function(), false); - for (const auto& host : registeredHostsSet) { - SPDLOG_INFO("Host: {}", host); - + for (const auto& h : thisRegisteredHosts) { // Work out resources on the remote host - const faabric::HostResources r = getHostResources(host); + faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); + // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than // its available slots. available = std::max(0, available); int nOnThisHost = std::min(available, remainder); + // Under the NEVER_ALONE topology hint, we never choose a host // unless we can schedule at least two requests in it. if (topologyHint == @@ -590,14 +568,17 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( nOnThisHost < 2) { continue; } + SPDLOG_TRACE("Scheduling {}/{} of {} on {} (registered)", nOnThisHost, nMessages, funcStr, - host); + h); + for (int i = 0; i < nOnThisHost; i++) { - hosts.push_back(host); + hosts.push_back(h); } + remainder -= nOnThisHost; if (remainder <= 0) { break; @@ -609,32 +590,16 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::string lastHost; if (remainder > 0) { std::vector unregisteredHosts; - std::vector> host_resources_pairs; - if (hostKindDifferent) { for (auto&& h : getAvailableHostsForFunction(firstMsg)) { - host_resources_pairs.push_back({h, getHostResources(h)}); + unregisteredHosts.push_back(std::move(h)); } } else { - unregisteredHosts = getUnregisteredHosts(firstMsg.user(), firstMsg.function()); - - for (auto&& h : unregisteredHosts) { - host_resources_pairs.push_back({h, getHostResources(h)}); - } - } - - SPDLOG_DEBUG("Reordering unregistered hosts based on LoadBalancePolicy"); - policy.dispatch(host_resources_pairs); - SPDLOG_DEBUG("Reordered unregistered hosts based on LoadBalancePolicy"); - SPDLOG_DEBUG("Unregistered Hosts map size: {}", host_resources_pairs.size()); - - // Extract a set of unregistered hosts from host_resources_pairs preserving the order - std::set unregisteredHostsSet; - for (const auto& [host, resources] : host_resources_pairs) { - unregisteredHostsSet.insert(host); + unregisteredHosts = + getUnregisteredHosts(firstMsg.user(), firstMsg.function()); } - for (const auto& h : unregisteredHostsSet) { + for (const auto& h : unregisteredHosts) { // Skip if this host if (h == thisHost) { continue; @@ -642,27 +607,22 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( lastHost = h; // Work out resources on the remote host - SPDLOG_DEBUG("Checkig unregeistered {} for resources", h); - SPDLOG_DEBUG("Remaining: {}", remainder); - - const faabric::HostResources r = getHostResources(h); // Get up to date info - + faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); + // We need to floor at zero here in case the remote host is // overloaded, in which case its used slots will be greater than // its available slots. available = std::max(0, available); int nOnThisHost = std::min(available, remainder); - SPDLOG_DEBUG("Unregisted Host Available Slots: {}, nOnThisHost: {}", available, nOnThisHost); - if (topologyHint == faabric::util::SchedulingTopologyHint::NEVER_ALONE && nOnThisHost < 2) { continue; } - SPDLOG_DEBUG("Scheduling {}/{} of {} on {} (unregistered)", + SPDLOG_TRACE("Scheduling {}/{} of {} on {} (unregistered)", nOnThisHost, nMessages, funcStr, From 96db3fafbb9a41492aae80ec451e20f253346eaa Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 21:33:14 +0000 Subject: [PATCH 57/84] added policy decisions to registered function decision making --- src/scheduler/Scheduler.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index db3fe98fb..04c8b883e 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -514,7 +514,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( hosts.push_back(thisHost); } } else { -// Work out how many we can handle locally + // Work out how many we can handle locally int slots = thisHostResources.slots(); if (topologyHint == faabric::util::SchedulingTopologyHint::UNDERFULL) { slots = slots / 2; @@ -550,7 +550,25 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getFunctionRegisteredHosts( firstMsg.user(), firstMsg.function(), false); - for (const auto& h : thisRegisteredHosts) { + // Get all the pairings of host and resources + std::vector> host_resource_pairs; + for (const auto& h : thisRegisteredHosts) + { + faabric::HostResources r = getHostResources(h); + host_resource_pairs.push_back(std::make_pair(h, r)); + } + + // Apply policy ordering + policy.dispatch(host_resource_pairs); + + std::vector ordered_registered_hosts; + for (const auto& [h, r] : host_resource_pairs) + { + ordered_registered_hosts.push_back(h); + } + + // Loop through the ordered registered hosts and schedule as many as possible on each + for (const auto& h : ordered_registered_hosts) { // Work out resources on the remote host faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); From 06e80c7d695fee8fb2b79a5346305fdb154a6faa Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 21:47:11 +0000 Subject: [PATCH 58/84] Apply load balancing policy to scheduler --- include/faabric/scheduler/Scheduler.h | 7 +++++ src/scheduler/Scheduler.cpp | 42 +++++++++++++++++---------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 20d2ffcf6..e45543a1c 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -466,12 +466,19 @@ class Scheduler faabric::util::SchedulingTopologyHint topologyHint, std::shared_ptr extraData); + std::set applyLoadBalancedPolicy( + const std::set& hosts, + const faabric::Message& msg, + faabric::util::SchedulingTopologyHint topologyHint); + std::shared_ptr claimExecutor(const faabric::MessageInBatch& msg); std::vector getUnregisteredHosts(const std::string& user, const std::string& function, bool noCache = false); + FaasmDefaultPolicy policy; + // ---- Accounting and debugging ---- std::vector recordedMessagesAll; std::vector recordedMessagesLocal; diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 04c8b883e..367e1ca43 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -550,22 +550,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getFunctionRegisteredHosts( firstMsg.user(), firstMsg.function(), false); - // Get all the pairings of host and resources - std::vector> host_resource_pairs; - for (const auto& h : thisRegisteredHosts) - { - faabric::HostResources r = getHostResources(h); - host_resource_pairs.push_back(std::make_pair(h, r)); - } - - // Apply policy ordering - policy.dispatch(host_resource_pairs); - - std::vector ordered_registered_hosts; - for (const auto& [h, r] : host_resource_pairs) - { - ordered_registered_hosts.push_back(h); - } + thisRegisteredHosts = applyLoadBalancedPolicy(thisRegisteredHosts, firstMsg, topologyHint); // Loop through the ordered registered hosts and schedule as many as possible on each for (const auto& h : ordered_registered_hosts) { @@ -617,6 +602,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getUnregisteredHosts(firstMsg.user(), firstMsg.function()); } + unregisteredHosts = applyLoadBalancedPolicy(unregisteredHosts, firstMsg, topologyHint); + for (const auto& h : unregisteredHosts) { // Skip if this host if (h == thisHost) { @@ -1033,6 +1020,29 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( return decision; } +std::set Scheduler::applyLoadBalancedPolicy(const std::set& hosts, const faabric::Message& msg, faabric::util::SchedulingTopologyHint topologyHint) +{ + std::vector> host_resource_pairs; + + // Fetch resources for each host to inform decision + for (const auto& h : hosts) + { + faabric::HostResources r = getHostResources(h); + host_resource_pairs.push_back(std::make_pair(h, r)); + } + + // Apply ordering to the pairs + this->policy.dispatch(host_resource_pairs); + + // Extract the ordered hosts + std::set ordered_hosts; + for (const auto& [h, r] : host_resource_pairs) + { + ordered_hosts.insert(h); + } + + return ordered_hosts; +} std::vector Scheduler::getUnregisteredHosts( const std::string& user, const std::string& function, From 2c9a58fe0790c12dd805c5f995c010c751335b6a Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 21:56:21 +0000 Subject: [PATCH 59/84] fixed importation errors --- include/faabric/scheduler/Scheduler.h | 1 + src/scheduler/Scheduler.cpp | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index e45543a1c..51bd9b771 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 367e1ca43..da8aeadde 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include From fd1b21a7984ab9cd3569f60e8637c3b2feeef836 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:00:33 +0000 Subject: [PATCH 60/84] removed redundant parameters from the applyLoadBalancedPolicy --- include/faabric/scheduler/Scheduler.h | 5 +---- src/scheduler/Scheduler.cpp | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 51bd9b771..0a3958c4a 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -467,10 +467,7 @@ class Scheduler faabric::util::SchedulingTopologyHint topologyHint, std::shared_ptr extraData); - std::set applyLoadBalancedPolicy( - const std::set& hosts, - const faabric::Message& msg, - faabric::util::SchedulingTopologyHint topologyHint); + std::set applyLoadBalancedPolicy(std::set hosts); std::shared_ptr claimExecutor(const faabric::MessageInBatch& msg); diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index da8aeadde..755dfaeb5 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -549,7 +549,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getFunctionRegisteredHosts( firstMsg.user(), firstMsg.function(), false); - thisRegisteredHosts = applyLoadBalancedPolicy(thisRegisteredHosts, firstMsg, topologyHint); + thisRegisteredHosts = applyLoadBalancedPolicy(thisRegisteredHosts); // Loop through the ordered registered hosts and schedule as many as possible on each for (const auto& h : ordered_registered_hosts) { @@ -601,7 +601,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getUnregisteredHosts(firstMsg.user(), firstMsg.function()); } - unregisteredHosts = applyLoadBalancedPolicy(unregisteredHosts, firstMsg, topologyHint); + unregisteredHosts = applyLoadBalancedPolicy(unregisteredHosts); for (const auto& h : unregisteredHosts) { // Skip if this host @@ -1019,7 +1019,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( return decision; } -std::set Scheduler::applyLoadBalancedPolicy(const std::set& hosts, const faabric::Message& msg, faabric::util::SchedulingTopologyHint topologyHint) +std::set Scheduler::applyLoadBalancedPolicy(std::set hosts) { std::vector> host_resource_pairs; From 7a41234adc61371e99c9643eacdbb37579eafd2c Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:02:08 +0000 Subject: [PATCH 61/84] Add FaasmDefaultPolicy initialization in Scheduler constructor --- src/scheduler/Scheduler.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 755dfaeb5..49ba57427 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -99,6 +99,8 @@ Scheduler::Scheduler() } this->updateMonitoring(); } + + policy = FaasmDefaultPolicy(); } Scheduler::~Scheduler() From 3ea3a66a1e368b088be7b18e2066d39b754d8e10 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:09:37 +0000 Subject: [PATCH 62/84] Remove unused header and initialize policy in Scheduler constructor --- include/faabric/scheduler/Scheduler.h | 1 - src/scheduler/Scheduler.cpp | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 0a3958c4a..7fbd6d92f 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 49ba57427..3eb11b5bb 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -21,6 +21,8 @@ #include #include +#include + #include #include #include @@ -99,8 +101,6 @@ Scheduler::Scheduler() } this->updateMonitoring(); } - - policy = FaasmDefaultPolicy(); } Scheduler::~Scheduler() @@ -1023,6 +1023,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( std::set Scheduler::applyLoadBalancedPolicy(std::set hosts) { + FaasmDefaultPolicy policy; std::vector> host_resource_pairs; // Fetch resources for each host to inform decision From 18531512515c8f0c12acb2e85f45ad32811a5d07 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:13:16 +0000 Subject: [PATCH 63/84] Refactor load balancing policy in Scheduler class --- include/faabric/scheduler/Scheduler.h | 2 -- src/scheduler/Scheduler.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 7fbd6d92f..18d31b65b 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -474,8 +474,6 @@ class Scheduler const std::string& function, bool noCache = false); - FaasmDefaultPolicy policy; - // ---- Accounting and debugging ---- std::vector recordedMessagesAll; std::vector recordedMessagesLocal; diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 3eb11b5bb..8cfc5f0bb 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -551,10 +551,10 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getFunctionRegisteredHosts( firstMsg.user(), firstMsg.function(), false); - thisRegisteredHosts = applyLoadBalancedPolicy(thisRegisteredHosts); + std::set balanced_registered_hosts = applyLoadBalancedPolicy(thisRegisteredHosts); // Loop through the ordered registered hosts and schedule as many as possible on each - for (const auto& h : ordered_registered_hosts) { + for (const auto& h : balanced_registered_hosts) { // Work out resources on the remote host faabric::HostResources r = getHostResources(h); int available = r.slots() - r.usedslots(); @@ -603,9 +603,9 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getUnregisteredHosts(firstMsg.user(), firstMsg.function()); } - unregisteredHosts = applyLoadBalancedPolicy(unregisteredHosts); + std::set balanced_unregistered_hosts = applyLoadBalancedPolicy(unregisteredHosts); - for (const auto& h : unregisteredHosts) { + for (const auto& h : balanced_unregistered_hosts) { // Skip if this host if (h == thisHost) { continue; From ab573918fe7f0473c96ab8c8fd71296c39117f11 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:20:40 +0000 Subject: [PATCH 64/84] Update Scheduler class to use vector instead of set in applyLoadBalancedPolicy method --- include/faabric/scheduler/Scheduler.h | 2 +- src/scheduler/Scheduler.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index 18d31b65b..cb0d2dbef 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -466,7 +466,7 @@ class Scheduler faabric::util::SchedulingTopologyHint topologyHint, std::shared_ptr extraData); - std::set applyLoadBalancedPolicy(std::set hosts); + std::set applyLoadBalancedPolicy(std::vector hosts); std::shared_ptr claimExecutor(const faabric::MessageInBatch& msg); diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 8cfc5f0bb..1c4f061e5 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -603,6 +603,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getUnregisteredHosts(firstMsg.user(), firstMsg.function()); } + // Convert unregistered hosts to a set + std::set balanced_unregistered_hosts = applyLoadBalancedPolicy(unregisteredHosts); for (const auto& h : balanced_unregistered_hosts) { @@ -1021,7 +1023,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( return decision; } -std::set Scheduler::applyLoadBalancedPolicy(std::set hosts) +std::set Scheduler::applyLoadBalancedPolicy(std::vector hosts) { FaasmDefaultPolicy policy; std::vector> host_resource_pairs; @@ -1034,7 +1036,7 @@ std::set Scheduler::applyLoadBalancedPolicy(std::set h } // Apply ordering to the pairs - this->policy.dispatch(host_resource_pairs); + thispolicy.dispatch(host_resource_pairs); // Extract the ordered hosts std::set ordered_hosts; From 89467089fb1512d8867642f270918fe2204aff80 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:21:18 +0000 Subject: [PATCH 65/84] Refactor dispatch method in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 1c4f061e5..3f2c827c3 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1036,7 +1036,7 @@ std::set Scheduler::applyLoadBalancedPolicy(std::vector ordered_hosts; From 9fa968ac2b315311e465c551318fc1a07b279c0b Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:25:02 +0000 Subject: [PATCH 66/84] Update user authentication logic --- src/scheduler/Scheduler.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 3f2c827c3..b252aff86 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -551,7 +551,12 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getFunctionRegisteredHosts( firstMsg.user(), firstMsg.function(), false); - std::set balanced_registered_hosts = applyLoadBalancedPolicy(thisRegisteredHosts); + std::vector registeredHosts; + for (const auto& h : thisRegisteredHosts) { + registeredHosts.push_back(h); + } + + std::set balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts); // Loop through the ordered registered hosts and schedule as many as possible on each for (const auto& h : balanced_registered_hosts) { From f0416110c489611d82682227e10caa7ea7560ef2 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:34:59 +0000 Subject: [PATCH 67/84] Refactor host balancing in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index b252aff86..b3ffe9581 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -608,11 +608,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( getUnregisteredHosts(firstMsg.user(), firstMsg.function()); } - // Convert unregistered hosts to a set - std::set balanced_unregistered_hosts = applyLoadBalancedPolicy(unregisteredHosts); - - for (const auto& h : balanced_unregistered_hosts) { + for (const auto& h : unregisteredHosts) { // Skip if this host if (h == thisHost) { continue; From ed57c4fba385b5209f519e1234dc8466521bac4b Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:35:38 +0000 Subject: [PATCH 68/84] Update load balancing policy to MostSlotsPolicy --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index b3ffe9581..04fff49b4 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1027,7 +1027,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( std::set Scheduler::applyLoadBalancedPolicy(std::vector hosts) { - FaasmDefaultPolicy policy; + MostSlotsPolicy policy; std::vector> host_resource_pairs; // Fetch resources for each host to inform decision From ebecda1d394a2443cdc3c7343bf4cc1cd50c250b Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Thu, 15 Feb 2024 22:45:22 +0000 Subject: [PATCH 69/84] Refactor scheduling logic and apply FaasmDefaultPolicy*** --- src/scheduler/Scheduler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 04fff49b4..5d46f008b 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -598,6 +598,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( // Now schedule to unregistered hosts if there are messages left std::string lastHost; if (remainder > 0) { + // Do not edit any of this code! It is a critical section and must be left as is :) std::vector unregisteredHosts; if (hostKindDifferent) { for (auto&& h : getAvailableHostsForFunction(firstMsg)) { @@ -1027,7 +1028,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( std::set Scheduler::applyLoadBalancedPolicy(std::vector hosts) { - MostSlotsPolicy policy; + FaasmDefaultPolicy policy; std::vector> host_resource_pairs; // Fetch resources for each host to inform decision From b63ebd1643da13e1db6ab0ece9855d7878173865 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Fri, 16 Feb 2024 20:14:23 +0000 Subject: [PATCH 70/84] Add load balancing policy and thresholds to SystemConfig --- include/faabric/util/config.h | 5 +++++ src/scheduler/Scheduler.cpp | 14 +++++++++++++- src/util/config.cpp | 16 +++++++++++++--- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/faabric/util/config.h b/include/faabric/util/config.h index c7dfce414..f082ca3c4 100644 --- a/include/faabric/util/config.h +++ b/include/faabric/util/config.h @@ -32,6 +32,11 @@ class SystemConfig bool isStorageNode; int noSingleHostOptimisations; + std::string load_balance_policy; + double offload_cpu_threshold; + double offload_ram_threshold; + double offload_load_avg_threshold; + // Worker-related timeouts int globalMessageTimeout; int boundTimeout; diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 5d46f008b..68107c64b 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1028,7 +1028,19 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( std::set Scheduler::applyLoadBalancedPolicy(std::vector hosts) { - FaasmDefaultPolicy policy; + // get load policy from config + std::string policyName = faabric::util::getSystemConfig().load_balance_policy; + + LoadBalancedPolicy policy; + if (policyName == "faasm_default") { + policy = FaasmDefaultPolicy(); + } else if (policyName == "most_slots") { + policy = MostSlotsPolicy(); + } else { + SPDLOG_ERROR("Unknown load balance policy: {}", policyName); + throw std::runtime_error("Unknown load balance policy"); + } + std::vector> host_resource_pairs; // Fetch resources for each host to inform decision diff --git a/src/util/config.cpp b/src/util/config.cpp index e417c8caa..7605c9abb 100644 --- a/src/util/config.cpp +++ b/src/util/config.cpp @@ -36,9 +36,11 @@ void SystemConfig::initialise() overrideCpuCount = this->getSystemConfIntParam("OVERRIDE_CPU_COUNT", "0"); noTopologyHints = getEnvVar("NO_TOPOLOGY_HINTS", "off"); isStorageNode = this->getSystemConfIntParam("IS_STORAGE_NODE", "0"); - noSingleHostOptimisations = - this->getSystemConfIntParam("NO_SINGLE_HOST", "0"); - + noSingleHostOptimisations = this->getSystemConfIntParam("NO_SINGLE_HOST", "0"); + load_balance_policy = getEnvVar("LOAD_BALANCE_POLICY", "faasm_default"); + offload_cpu_threshold = stod(getEnvVar("OFFLOAD_CPU_THRESHOLD", "0.8")); + offload_ram_threshold = stod(getEnvVar("OFFLOAD_RAM_THRESHOLD", "0.8")); + offload_load_avg_threshold = stod(getEnvVar("OFFLOAD_LOAD_AVG_THRESHOLD", "0.8")); // Worker-related timeouts (all in seconds) globalMessageTimeout = this->getSystemConfIntParam("GLOBAL_MESSAGE_TIMEOUT", "60000"); @@ -115,6 +117,14 @@ void SystemConfig::print() SPDLOG_INFO("OVERRIDE_CPU_COUNT {}", overrideCpuCount); SPDLOG_INFO("NO_TOPOLOGY_HINTS {}", noTopologyHints); SPDLOG_INFO("IS_STORAGE_NODE {}", isStorageNode); + SPDLOG_INFO("LOAD_BALANCE_POLICY {}", load_balance_policy); + + if (isStorageNode) { + SPDLOG_INFO("--- Metrics Collection Module ---"); + SPDLOG_INFO("OFFLOAD_CPU_THRESHOLD {}", offload_cpu_threshold); + SPDLOG_INFO("OFFLOAD_RAM_THRESHOLD {}", offload_ram_threshold); + SPDLOG_INFO("OFFLOAD_LOAD_AVG_THRESHOLD {}", offload_load_avg_threshold); + } SPDLOG_INFO("--- Timeouts ---"); SPDLOG_INFO("GLOBAL_MESSAGE_TIMEOUT {}", globalMessageTimeout); From cb2fcf389dfaafffdfa60f917a3d425b908af6b7 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Fri, 16 Feb 2024 21:10:00 +0000 Subject: [PATCH 71/84] Refactor load balancing policy in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 46 ++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 68107c64b..eeb91aa8f 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -443,8 +443,21 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::shared_ptr req, faabric::util::SchedulingTopologyHint topologyHint) { - // MostSlotsPolicy policy; - FaasmDefaultPolicy policy; + + // get load policy from config + std::string policyName = faabric::util::getSystemConfig().load_balance_policy; + + FaasmDefaultPolicy faasm_default_policy; + MostSlotsPolicy most_slots_policy; + + if (policyName == "faasm_default") { + policy = FaasmDefaultPolicy(); + } else if (policyName == "most_slots") { + policy = MostSlotsPolicy(); + } else { + SPDLOG_ERROR("Unknown load balance policy: {}", policyName); + throw std::runtime_error("Unknown load balance policy"); + } ZoneScopedNS("Scheduler::makeSchedulingDecision", 5); int nMessages = req->messages_size(); const faabric::Message& firstMsg = req->messages().at(0); @@ -555,9 +568,17 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( for (const auto& h : thisRegisteredHosts) { registeredHosts.push_back(h); } - - std::set balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts); - + + // if conf.load_balance_policy == "faasm_default" { + std::set balanced_registered_hosts; + if (policyName == "faasm_default") { + balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts, faasm_default_policy); + } else if (policyName == "most_slots") { + balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts, most_slots_policy); + } else { + SPDLOG_ERROR("Unknown load balance policy: {}", policyName); + balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts, faasm_default_policy); + } // Loop through the ordered registered hosts and schedule as many as possible on each for (const auto& h : balanced_registered_hosts) { // Work out resources on the remote host @@ -1026,21 +1047,8 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( return decision; } -std::set Scheduler::applyLoadBalancedPolicy(std::vector hosts) +std::set Scheduler::applyLoadBalancedPolicy(std::vector hosts, LoadBalancePolicy& policy) { - // get load policy from config - std::string policyName = faabric::util::getSystemConfig().load_balance_policy; - - LoadBalancedPolicy policy; - if (policyName == "faasm_default") { - policy = FaasmDefaultPolicy(); - } else if (policyName == "most_slots") { - policy = MostSlotsPolicy(); - } else { - SPDLOG_ERROR("Unknown load balance policy: {}", policyName); - throw std::runtime_error("Unknown load balance policy"); - } - std::vector> host_resource_pairs; // Fetch resources for each host to inform decision From 54b0b9914615086b5ad2ad95104739115d4123dc Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Fri, 16 Feb 2024 21:13:38 +0000 Subject: [PATCH 72/84] Refactor load balancing policy in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 48 +++++++++++++++---------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index eeb91aa8f..aa9a46eb6 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -443,21 +443,8 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::shared_ptr req, faabric::util::SchedulingTopologyHint topologyHint) { - - // get load policy from config - std::string policyName = faabric::util::getSystemConfig().load_balance_policy; - - FaasmDefaultPolicy faasm_default_policy; - MostSlotsPolicy most_slots_policy; - - if (policyName == "faasm_default") { - policy = FaasmDefaultPolicy(); - } else if (policyName == "most_slots") { - policy = MostSlotsPolicy(); - } else { - SPDLOG_ERROR("Unknown load balance policy: {}", policyName); - throw std::runtime_error("Unknown load balance policy"); - } + // MostSlotsPolicy policy; + FaasmDefaultPolicy policy; ZoneScopedNS("Scheduler::makeSchedulingDecision", 5); int nMessages = req->messages_size(); const faabric::Message& firstMsg = req->messages().at(0); @@ -568,17 +555,9 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( for (const auto& h : thisRegisteredHosts) { registeredHosts.push_back(h); } - - // if conf.load_balance_policy == "faasm_default" { - std::set balanced_registered_hosts; - if (policyName == "faasm_default") { - balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts, faasm_default_policy); - } else if (policyName == "most_slots") { - balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts, most_slots_policy); - } else { - SPDLOG_ERROR("Unknown load balance policy: {}", policyName); - balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts, faasm_default_policy); - } + + std::set balanced_registered_hosts = applyLoadBalancedPolicy(registeredHosts); + // Loop through the ordered registered hosts and schedule as many as possible on each for (const auto& h : balanced_registered_hosts) { // Work out resources on the remote host @@ -1047,8 +1026,10 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions( return decision; } -std::set Scheduler::applyLoadBalancedPolicy(std::vector hosts, LoadBalancePolicy& policy) +std::set Scheduler::applyLoadBalancedPolicy(std::vector hosts) { + // get load policy from config + std::string policyName = faabric::util::getSystemConfig().load_balance_policy; std::vector> host_resource_pairs; // Fetch resources for each host to inform decision @@ -1058,8 +1039,17 @@ std::set Scheduler::applyLoadBalancedPolicy(std::vector ordered_hosts; From a6fa85650bd49006108a1b7223f31e0d665aac0d Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Fri, 16 Feb 2024 21:15:12 +0000 Subject: [PATCH 73/84] added default policy to exception handling --- src/scheduler/Scheduler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index aa9a46eb6..e41fd7729 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1048,6 +1048,7 @@ std::set Scheduler::applyLoadBalancedPolicy(std::vector Date: Mon, 19 Feb 2024 16:15:22 +0000 Subject: [PATCH 74/84] Add debug logs for load balancing policies --- src/scheduler/Scheduler.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index e41fd7729..6f2ca90e6 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1042,9 +1042,11 @@ std::set Scheduler::applyLoadBalancedPolicy(std::vector Date: Mon, 19 Feb 2024 17:18:15 +0000 Subject: [PATCH 75/84] Remove metrics collection module from SystemConfig print() function --- src/util/config.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/util/config.cpp b/src/util/config.cpp index 7605c9abb..45082df0f 100644 --- a/src/util/config.cpp +++ b/src/util/config.cpp @@ -118,13 +118,6 @@ void SystemConfig::print() SPDLOG_INFO("NO_TOPOLOGY_HINTS {}", noTopologyHints); SPDLOG_INFO("IS_STORAGE_NODE {}", isStorageNode); SPDLOG_INFO("LOAD_BALANCE_POLICY {}", load_balance_policy); - - if (isStorageNode) { - SPDLOG_INFO("--- Metrics Collection Module ---"); - SPDLOG_INFO("OFFLOAD_CPU_THRESHOLD {}", offload_cpu_threshold); - SPDLOG_INFO("OFFLOAD_RAM_THRESHOLD {}", offload_ram_threshold); - SPDLOG_INFO("OFFLOAD_LOAD_AVG_THRESHOLD {}", offload_load_avg_threshold); - } SPDLOG_INFO("--- Timeouts ---"); SPDLOG_INFO("GLOBAL_MESSAGE_TIMEOUT {}", globalMessageTimeout); From 0cde3d69964cd14ddc76deaf663c8c3dddce0e0f Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Mon, 19 Feb 2024 20:45:28 +0000 Subject: [PATCH 76/84] Add load average to HostResources message and update Scheduler class --- include/faabric/scheduler/Scheduler.h | 1 + src/proto/faabric.proto | 1 + src/scheduler/Scheduler.cpp | 2 ++ 3 files changed, 4 insertions(+) diff --git a/include/faabric/scheduler/Scheduler.h b/include/faabric/scheduler/Scheduler.h index cb0d2dbef..8823f6002 100644 --- a/include/faabric/scheduler/Scheduler.h +++ b/include/faabric/scheduler/Scheduler.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto index f85af23ed..ad607dd1d 100644 --- a/src/proto/faabric.proto +++ b/src/proto/faabric.proto @@ -47,6 +47,7 @@ message BatchExecuteRequest { message HostResources { int32 slots = 1; int32 usedSlots = 2; + double loadAverage = 3; } message UnregisterRequest { diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 6f2ca90e6..e764b1cef 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1691,6 +1691,8 @@ faabric::HostResources Scheduler::getThisHostResources() faabric::HostResources hostResources = thisHostResources; hostResources.set_usedslots( this->thisHostUsedSlots.load(std::memory_order_acquire)); + hostResources.set_loadaverage(faabric::util::getLoadAverage()) + SPDLOG_DEBUG("Set load average to {}", hostResources.loadaverage()); return hostResources; } From 14b241bc5f08c61faa2ee857f8866d3a328ecc22 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Mon, 19 Feb 2024 20:51:46 +0000 Subject: [PATCH 77/84] Add getLoadAverage() function to system metrics --- include/faabric/util/system_metrics.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/faabric/util/system_metrics.h b/include/faabric/util/system_metrics.h index 06aa4399e..452918c06 100644 --- a/include/faabric/util/system_metrics.h +++ b/include/faabric/util/system_metrics.h @@ -32,4 +32,5 @@ namespace faabric::util { UtilisationStats getSystemUtilisation(); CPUStats getCPUUtilisation(); double getMemoryUtilisation(); + double getLoadAverage(); } \ No newline at end of file From 3ad6e8c19329dd87a9124a5bfb5470b071909cb1 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Mon, 19 Feb 2024 20:55:15 +0000 Subject: [PATCH 78/84] Add pragma once directive to system_metrics.h --- include/faabric/util/system_metrics.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/faabric/util/system_metrics.h b/include/faabric/util/system_metrics.h index 452918c06..eafea3188 100644 --- a/include/faabric/util/system_metrics.h +++ b/include/faabric/util/system_metrics.h @@ -1,3 +1,4 @@ +#pragma once #include #include From ef1b2d0fceb01ff9996b4491c49fb0c92ecfa2a9 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Mon, 19 Feb 2024 21:02:53 +0000 Subject: [PATCH 79/84] Refactor load average calculation in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index e764b1cef..b5410027a 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1691,7 +1691,8 @@ faabric::HostResources Scheduler::getThisHostResources() faabric::HostResources hostResources = thisHostResources; hostResources.set_usedslots( this->thisHostUsedSlots.load(std::memory_order_acquire)); - hostResources.set_loadaverage(faabric::util::getLoadAverage()) + auto load_average = faabric::util::getLoadAverage(); + hostResources.set_loadaverage(load_average); SPDLOG_DEBUG("Set load average to {}", hostResources.loadaverage()); return hostResources; } From 59756dc06049a9d04a460766a9d685c7efbf9039 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Mon, 19 Feb 2024 21:18:16 +0000 Subject: [PATCH 80/84] Implement least load average policy for load balancing --- src/loadbalance/LeastLoadAveragePolicy.cpp | 7 ++++++- src/scheduler/Scheduler.cpp | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/loadbalance/LeastLoadAveragePolicy.cpp b/src/loadbalance/LeastLoadAveragePolicy.cpp index d948c99a4..47f7ac349 100644 --- a/src/loadbalance/LeastLoadAveragePolicy.cpp +++ b/src/loadbalance/LeastLoadAveragePolicy.cpp @@ -3,5 +3,10 @@ std::vector> LeastLoadAveragePolicy::dispatch(std::vector>& host_resources) { - throw std::runtime_error("LeastLoadAveragePolicy::dispatch not implemented"); + // Sort the vector by the load average in ascending order + std::sort(host_resources.begin(), host_resources.end(), [](const auto &a, const auto &b) { + return a.second.loadaverage() < b.second.loadaverage(); + }); + + return host_resources; } \ No newline at end of file diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index b5410027a..0407d9bd7 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -1048,6 +1048,10 @@ std::set Scheduler::applyLoadBalancedPolicy(std::vectorthisHostUsedSlots.load(std::memory_order_acquire)); auto load_average = faabric::util::getLoadAverage(); hostResources.set_loadaverage(load_average); - SPDLOG_DEBUG("Set load average to {}", hostResources.loadaverage()); return hostResources; } From 5d7012240e68471f465e6444fb211e23a8d73918 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Mon, 19 Feb 2024 21:20:26 +0000 Subject: [PATCH 81/84] Remove unused policy and update function signature in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 0407d9bd7..455a9f707 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -443,8 +443,6 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::shared_ptr req, faabric::util::SchedulingTopologyHint topologyHint) { - // MostSlotsPolicy policy; - FaasmDefaultPolicy policy; ZoneScopedNS("Scheduler::makeSchedulingDecision", 5); int nMessages = req->messages_size(); const faabric::Message& firstMsg = req->messages().at(0); From c14f513602157c8d97ee2313aef9853460f4d1fd Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Mon, 19 Feb 2024 22:41:34 +0000 Subject: [PATCH 82/84] Add debug log statements to get registered and available hosts --- src/scheduler/Scheduler.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 455a9f707..0c1b82318 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -545,6 +545,7 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( int remainder = nMessages - nLocally; if (!hostKindDifferent && remainder > 0) { + SPDLOG_DEBUG("Getting registered hosts for {}/{}", firstMsg.user(), firstMsg.function()); const std::set& thisRegisteredHosts = getFunctionRegisteredHosts( firstMsg.user(), firstMsg.function(), false); @@ -597,12 +598,15 @@ faabric::util::SchedulingDecision Scheduler::doSchedulingDecision( std::string lastHost; if (remainder > 0) { // Do not edit any of this code! It is a critical section and must be left as is :) + std::vector unregisteredHosts; if (hostKindDifferent) { + SPDLOG_DEBUG("Getting available hosts for {}/{}", firstMsg.user(), firstMsg.function()); for (auto&& h : getAvailableHostsForFunction(firstMsg)) { unregisteredHosts.push_back(std::move(h)); } } else { + SPDLOG_DEBUG("Getting unregistered hosts"); unregisteredHosts = getUnregisteredHosts(firstMsg.user(), firstMsg.function()); } From f0ed961c882e3f49dcf7e8fccc52dbed402dc579 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Mon, 19 Feb 2024 23:11:22 +0000 Subject: [PATCH 83/84] Add debug log for acquiring lock in getFunctionRegisteredHosts() function --- src/scheduler/Scheduler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 0c1b82318..63fae3e9d 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -347,6 +347,7 @@ const std::set& Scheduler::getFunctionRegisteredHosts( { faabric::util::SharedLock lock; if (acquireLock) { + SPDLOG_DEBUG("Acquiring lock for registered hosts") lock = faabric::util::SharedLock(mx); } std::string key = user + "/" + func; From 5e63c7d7785117f3ae1ac491974f48e74393fbc4 Mon Sep 17 00:00:00 2001 From: Donald Jennings Date: Tue, 20 Feb 2024 12:54:31 +0000 Subject: [PATCH 84/84] Fix acquiring lock for registered hosts in Scheduler.cpp --- src/scheduler/Scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp index 63fae3e9d..4eab73428 100644 --- a/src/scheduler/Scheduler.cpp +++ b/src/scheduler/Scheduler.cpp @@ -347,7 +347,7 @@ const std::set& Scheduler::getFunctionRegisteredHosts( { faabric::util::SharedLock lock; if (acquireLock) { - SPDLOG_DEBUG("Acquiring lock for registered hosts") + SPDLOG_DEBUG("Acquiring lock for registered hosts"); lock = faabric::util::SharedLock(mx); } std::string key = user + "/" + func;