From bdf67f36921663adf65aa23206df21eff6c81167 Mon Sep 17 00:00:00 2001 From: Krazer Date: Thu, 26 Mar 2026 00:28:36 -0500 Subject: [PATCH] fix llama config --- src/arbiterAI/hardwareDetector.cpp | 29 ++++- src/server/routes.cpp | 47 +++++--- tests/llamaProviderTests.cpp | 179 ++++++++++++++++++++++++++++- 3 files changed, 231 insertions(+), 24 deletions(-) diff --git a/src/arbiterAI/hardwareDetector.cpp b/src/arbiterAI/hardwareDetector.cpp index d2cdd7c..0d292d0 100644 --- a/src/arbiterAI/hardwareDetector.cpp +++ b/src/arbiterAI/hardwareDetector.cpp @@ -559,6 +559,14 @@ void HardwareDetector::detectVulkanGpus() VkPhysicalDeviceProperties props{}; getProperties(devices[i], &props); + // Skip CPU-based software renderers (e.g. llvmpipe) — they report + // system RAM as VRAM and are not useful for model inference. + if(props.deviceType==VK_PHYSICAL_DEVICE_TYPE_CPU) + { + spdlog::debug("Skipping Vulkan software renderer: {}", props.deviceName); + continue; + } + // Skip devices already detected via NVML (match by name) bool alreadyDetected=false; for(const GpuInfo &existing:m_systemInfo.gpus) @@ -665,8 +673,16 @@ void HardwareDetector::detectUnifiedMemory() if(matchedCardPath.empty()) { - spdlog::debug("No amdgpu sysfs match for integrated GPU {}: {}", - gpu.index, gpu.name); + // No sysfs match — fall back to system RAM as GPU-accessible pool. + // Unified memory GPUs share system RAM, so the GPU can access most of it. + // Use total system RAM as the accessible pool estimate. + gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb; + gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb; + + spdlog::info("Unified memory GPU {}: {} — no sysfs match, " + "falling back to system RAM ({}MB total, {}MB free) as GPU-accessible pool", + gpu.index, gpu.name, + gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb); continue; } @@ -730,7 +746,14 @@ void HardwareDetector::detectUnifiedMemory() } else { - spdlog::debug("No GTT info for integrated GPU {}: {}", gpu.index, gpu.name); + // sysfs card matched but no GTT info — fall back to system RAM + gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb; + gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb; + + spdlog::info("Unified memory GPU {}: {} — no GTT info, " + "falling back to system RAM ({}MB total, {}MB free) as GPU-accessible pool", + gpu.index, gpu.name, + gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb); } } #endif diff --git a/src/server/routes.cpp b/src/server/routes.cpp index b0ca906..a3994d6 100644 --- a/src/server/routes.cpp +++ b/src/server/routes.cpp @@ -808,30 +808,39 @@ void handleGetLoadedModels(const httplib::Request &, httplib::Response &res) void handleLoadModel(const httplib::Request &req, httplib::Response &res) { - std::string modelName=req.matches[1]; - std::string variant; - int contextSize=0; + try + { + std::string modelName=req.matches[1]; + std::string variant; + int contextSize=0; - if(req.has_param("variant")) - variant=req.get_param_value("variant"); - if(req.has_param("context")) - contextSize=std::stoi(req.get_param_value("context")); + if(req.has_param("variant")) + variant=req.get_param_value("variant"); + if(req.has_param("context")) + contextSize=std::stoi(req.get_param_value("context")); - ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize); + ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize); - if(err==ErrorCode::Success) - { - res.set_content(nlohmann::json{{"status", "loaded"}, {"model", modelName}}.dump(), "application/json"); - } - else if(err==ErrorCode::ModelDownloading) - { - res.status=202; - res.set_content(nlohmann::json{{"status", "downloading"}, {"model", modelName}}.dump(), "application/json"); + if(err==ErrorCode::Success) + { + res.set_content(nlohmann::json{{"status", "loaded"}, {"model", modelName}}.dump(), "application/json"); + } + else if(err==ErrorCode::ModelDownloading) + { + res.status=202; + res.set_content(nlohmann::json{{"status", "downloading"}, {"model", modelName}}.dump(), "application/json"); + } + else + { + res.status=400; + res.set_content(errorJson("Failed to load model: "+errorCodeToString(err), "invalid_request_error", "model", errorCodeToString(err)).dump(), "application/json"); + } } - else + catch(const std::exception &e) { - res.status=400; - res.set_content(errorJson("Failed to load model: "+errorCodeToString(err), "invalid_request_error", "model", errorCodeToString(err)).dump(), "application/json"); + spdlog::error("Exception in handleLoadModel: {}", e.what()); + res.status=500; + res.set_content(errorJson(std::string("Internal error: ")+e.what(), "server_error").dump(), "application/json"); } } diff --git a/tests/llamaProviderTests.cpp b/tests/llamaProviderTests.cpp index 03541e1..b535c68 100644 --- a/tests/llamaProviderTests.cpp +++ b/tests/llamaProviderTests.cpp @@ -6,14 +6,17 @@ #include "arbiterAI/telemetryCollector.h" #include "arbiterAI/modelManager.h" +#include #include #include namespace arbiterAI { -static const std::string MODEL_NAME="qwen2.5-7b-instruct"; -static const std::string SMALL_MODEL_NAME="qwen2.5-1.5b-instruct"; +static const std::string MODEL_NAME="Qwen2.5-7B-Instruct"; +static const std::string SMALL_MODEL_NAME="Qwen2.5-1.5B-Instruct"; +static const std::string MODEL_FILE="Qwen2.5-7B-Instruct-Q4_K_M.gguf"; +static const std::string SMALL_MODEL_FILE="Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"; class LlamaProviderTest : public ::testing::Test { protected: @@ -37,6 +40,13 @@ class LlamaProviderTest : public ::testing::Test { { GTEST_SKIP() << "Model '" << MODEL_NAME << "' has no variants configured"; } + + // Check that the GGUF file actually exists on disk + std::string filePath="/models/"+MODEL_FILE; + if(!std::filesystem::exists(filePath)) + { + GTEST_SKIP() << "Model file not found at " << filePath; + } } void TearDown() override @@ -210,6 +220,13 @@ TEST_F(LlamaProviderTest, ModelSwitching) GTEST_SKIP() << "Model '" << SMALL_MODEL_NAME << "' not found in config or has no variants"; } + // Verify the small model file exists + std::string smallFilePath="/models/"+SMALL_MODEL_FILE; + if(!std::filesystem::exists(smallFilePath)) + { + GTEST_SKIP() << "Small model file not found at " << smallFilePath; + } + // Create a client with the 7B model ChatConfig config; config.model=MODEL_NAME; @@ -256,4 +273,162 @@ TEST_F(LlamaProviderTest, ModelSwitching) EXPECT_NE(response1.model, response2.model); } +// ─── Config injection tests ────────────────────────────────────────────── + +static const std::string INJECTED_MODEL_NAME="injected-qwen-test"; + +class LlamaConfigInjectionTest : public ::testing::Test { +protected: + void SetUp() override + { + ModelRuntime::reset(); + TelemetryCollector::reset(); + + ArbiterAI &ai=ArbiterAI::instance(); + ai.initialize({"tests/config"}); + + // Use the small model file for injection tests (faster to load) + std::string filePath="/models/"+SMALL_MODEL_FILE; + if(!std::filesystem::exists(filePath)) + { + GTEST_SKIP() << "Small model file not found at " << filePath; + } + } + + void TearDown() override + { + ModelRuntime::instance().unloadModel(INJECTED_MODEL_NAME); + ModelRuntime::reset(); + TelemetryCollector::reset(); + } + + nlohmann::json buildInjectedModelJson() const + { + return nlohmann::json{ + {"model", INJECTED_MODEL_NAME}, + {"provider", "llama"}, + {"ranking", 1}, + {"version", "1.1.0"}, + {"context_window", 4096}, + {"max_tokens", 2048}, + {"max_output_tokens", 1024}, + {"hardware_requirements", { + {"min_system_ram_mb", 2048}, + {"parameter_count", "1.5B"} + }}, + {"context_scaling", { + {"base_context", 4096}, + {"max_context", 32768}, + {"vram_per_1k_context_mb", 32} + }}, + {"variants", nlohmann::json::array({ + { + {"quantization", "Q4_K_M"}, + {"file_size_mb", 941}, + {"min_vram_mb", 1536}, + {"recommended_vram_mb", 2048}, + {"download", { + {"url", ""}, + {"sha256", ""}, + {"filename", SMALL_MODEL_FILE} + }} + } + })} + }; + } +}; + +TEST_F(LlamaConfigInjectionTest, InjectAndLoad) +{ + nlohmann::json modelJson=buildInjectedModelJson(); + + std::string error; + bool added=ModelManager::instance().addModelFromJson(modelJson, error); + ASSERT_TRUE(added) << "addModelFromJson failed: " << error; + + // Verify it was registered + std::optional info=ModelManager::instance().getModelInfo(INJECTED_MODEL_NAME); + ASSERT_TRUE(info.has_value()); + EXPECT_EQ(info->provider, "llama"); + EXPECT_EQ(info->variants.size(), 1u); + EXPECT_EQ(info->variants[0].quantization, "Q4_K_M"); + EXPECT_EQ(info->variants[0].download.filename, SMALL_MODEL_FILE); + + // Load the injected model via ModelRuntime + ErrorCode loadResult=ModelRuntime::instance().loadModel(INJECTED_MODEL_NAME, "Q4_K_M", 4096); + EXPECT_EQ(loadResult, ErrorCode::Success); + + // Verify model state + std::optional state=ModelRuntime::instance().getModelState(INJECTED_MODEL_NAME); + ASSERT_TRUE(state.has_value()); + EXPECT_EQ(state->state, ModelState::Loaded); + EXPECT_EQ(state->variant, "Q4_K_M"); + EXPECT_NE(state->llamaModel, nullptr); + EXPECT_NE(state->llamaCtx, nullptr); +} + +TEST_F(LlamaConfigInjectionTest, InjectAndRunCompletion) +{ + nlohmann::json modelJson=buildInjectedModelJson(); + + std::string error; + bool added=ModelManager::instance().addModelFromJson(modelJson, error); + ASSERT_TRUE(added) << "addModelFromJson failed: " << error; + + // Create a ChatClient with the injected model + ChatConfig config; + config.model=INJECTED_MODEL_NAME; + config.maxTokens=32; + + std::shared_ptr client=ArbiterAI::instance().createChatClient(config); + ASSERT_NE(client, nullptr) << "Failed to create ChatClient for injected model"; + EXPECT_EQ(client->getModel(), INJECTED_MODEL_NAME); + + CompletionRequest request; + request.model=INJECTED_MODEL_NAME; + request.max_tokens=32; + request.messages={{"user", "What is 1+1? Answer with just the number."}}; + + CompletionResponse response; + ErrorCode result=client->completion(request, response); + + EXPECT_EQ(result, ErrorCode::Success); + EXPECT_FALSE(response.text.empty()); + EXPECT_EQ(response.provider, "llama"); + EXPECT_EQ(response.model, INJECTED_MODEL_NAME); + EXPECT_GT(response.usage.total_tokens, 0); +} + +TEST_F(LlamaConfigInjectionTest, InjectDuplicateFails) +{ + nlohmann::json modelJson=buildInjectedModelJson(); + + std::string error; + bool added=ModelManager::instance().addModelFromJson(modelJson, error); + ASSERT_TRUE(added) << "First injection failed: " << error; + + // Try to inject same model name again + bool addedAgain=ModelManager::instance().addModelFromJson(modelJson, error); + EXPECT_FALSE(addedAgain); + EXPECT_NE(error.find("already exists"), std::string::npos); +} + +TEST_F(LlamaConfigInjectionTest, InjectWithoutVariantsFails) +{ + nlohmann::json modelJson={ + {"model", "no-variants-llama"}, + {"provider", "llama"}, + {"ranking", 1}, + {"version", "1.1.0"} + }; + + std::string error; + bool added=ModelManager::instance().addModelFromJson(modelJson, error); + ASSERT_TRUE(added) << "addModelFromJson failed: " << error; + + // Loading a llama model without variants should fail + ErrorCode loadResult=ModelRuntime::instance().loadModel("no-variants-llama"); + EXPECT_EQ(loadResult, ErrorCode::InvalidRequest); +} + } // namespace arbiterAI