From bdf67f36921663adf65aa23206df21eff6c81167 Mon Sep 17 00:00:00 2001
From: Krazer <caseymcc@krazer.net>
Date: Thu, 26 Mar 2026 00:28:36 -0500
Subject: [PATCH] fix llama config

---
 src/arbiterAI/hardwareDetector.cpp |  29 ++++-
 src/server/routes.cpp              |  47 +++++---
 tests/llamaProviderTests.cpp       | 179 ++++++++++++++++++++++++++++-
 3 files changed, 231 insertions(+), 24 deletions(-)

diff --git a/src/arbiterAI/hardwareDetector.cpp b/src/arbiterAI/hardwareDetector.cpp
index d2cdd7c..0d292d0 100644
--- a/src/arbiterAI/hardwareDetector.cpp
+++ b/src/arbiterAI/hardwareDetector.cpp
@@ -559,6 +559,14 @@ void HardwareDetector::detectVulkanGpus()
         VkPhysicalDeviceProperties props{};
         getProperties(devices[i], &props);
 
+        // Skip CPU-based software renderers (e.g. llvmpipe) — they report
+        // system RAM as VRAM and are not useful for model inference.
+        if(props.deviceType==VK_PHYSICAL_DEVICE_TYPE_CPU)
+        {
+            spdlog::debug("Skipping Vulkan software renderer: {}", props.deviceName);
+            continue;
+        }
+
         // Skip devices already detected via NVML (match by name)
         bool alreadyDetected=false;
         for(const GpuInfo &existing:m_systemInfo.gpus)
@@ -665,8 +673,16 @@ void HardwareDetector::detectUnifiedMemory()
 
         if(matchedCardPath.empty())
         {
-            spdlog::debug("No amdgpu sysfs match for integrated GPU {}: {}",
-                gpu.index, gpu.name);
+            // No sysfs match — fall back to system RAM as the GPU-accessible pool.
+            // Unified memory GPUs share system RAM, so total/free system RAM is used
+            // as an upper-bound estimate; the real GPU-visible share may be smaller.
+            gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb;
+            gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb;
+
+            spdlog::info("Unified memory GPU {}: {} — no sysfs match, "
+                "falling back to system RAM ({}MB total, {}MB free) as GPU-accessible pool",
+                gpu.index, gpu.name,
+                gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb);
             continue;
         }
 
@@ -730,7 +746,14 @@ void HardwareDetector::detectUnifiedMemory()
         }
         else
         {
-            spdlog::debug("No GTT info for integrated GPU {}: {}", gpu.index, gpu.name);
+            // sysfs card matched but no GTT info — fall back to system RAM
+            gpu.gpuAccessibleRamMb=m_systemInfo.totalRamMb;
+            gpu.gpuAccessibleRamFreeMb=m_systemInfo.freeRamMb;
+
+            spdlog::info("Unified memory GPU {}: {} — no GTT info, "
+                "falling back to system RAM ({}MB total, {}MB free) as GPU-accessible pool",
+                gpu.index, gpu.name,
+                gpu.gpuAccessibleRamMb, gpu.gpuAccessibleRamFreeMb);
         }
     }
 #endif
diff --git a/src/server/routes.cpp b/src/server/routes.cpp
index b0ca906..a3994d6 100644
--- a/src/server/routes.cpp
+++ b/src/server/routes.cpp
@@ -808,30 +808,60 @@ void handleGetLoadedModels(const httplib::Request &, httplib::Response &res)
 
+// HTTP handler: loads the model named by the first route capture.
+// Optional query parameters: "variant" and "context" (parsed as an integer).
+// Leaves the default status when the model is loaded, answers 202 while it is
+// downloading, 400 for a load failure or a malformed "context" value, and 500
+// when any other std::exception escapes.
 void handleLoadModel(const httplib::Request &req, httplib::Response &res)
 {
-    std::string modelName=req.matches[1];
-    std::string variant;
-    int contextSize=0;
+    try
+    {
+        std::string modelName=req.matches[1];
+        std::string variant;
+        int contextSize=0;
 
-    if(req.has_param("variant"))
-        variant=req.get_param_value("variant");
-    if(req.has_param("context"))
-        contextSize=std::stoi(req.get_param_value("context"));
+        if(req.has_param("variant"))
+            variant=req.get_param_value("variant");
+        if(req.has_param("context"))
+        {
+            // std::stoi throws std::invalid_argument / std::out_of_range (both
+            // derive from std::logic_error) on a malformed value. That is a
+            // client mistake, so answer 400 here rather than letting the outer
+            // handler report it as a 500.
+            try
+            {
+                contextSize=std::stoi(req.get_param_value("context"));
+            }
+            catch(const std::logic_error &)
+            {
+                res.status=400;
+                res.set_content(errorJson("Invalid 'context' parameter: expected an integer", "invalid_request_error", "context", errorCodeToString(ErrorCode::InvalidRequest)).dump(), "application/json");
+                return;
+            }
+        }
 
-    ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize);
+        ErrorCode err=ArbiterAI::instance().loadModel(modelName, variant, contextSize);
 
-    if(err==ErrorCode::Success)
-    {
-        res.set_content(nlohmann::json{{"status", "loaded"}, {"model", modelName}}.dump(), "application/json");
-    }
-    else if(err==ErrorCode::ModelDownloading)
-    {
-        res.status=202;
-        res.set_content(nlohmann::json{{"status", "downloading"}, {"model", modelName}}.dump(), "application/json");
+        if(err==ErrorCode::Success)
+        {
+            res.set_content(nlohmann::json{{"status", "loaded"}, {"model", modelName}}.dump(), "application/json");
+        }
+        else if(err==ErrorCode::ModelDownloading)
+        {
+            res.status=202;
+            res.set_content(nlohmann::json{{"status", "downloading"}, {"model", modelName}}.dump(), "application/json");
+        }
+        else
+        {
+            res.status=400;
+            res.set_content(errorJson("Failed to load model: "+errorCodeToString(err), "invalid_request_error", "model", errorCodeToString(err)).dump(), "application/json");
+        }
     }
-    else
+    catch(const std::exception &e)
     {
-        res.status=400;
-        res.set_content(errorJson("Failed to load model: "+errorCodeToString(err), "invalid_request_error", "model", errorCodeToString(err)).dump(), "application/json");
+        spdlog::error("Exception in handleLoadModel: {}", e.what());
+        res.status=500;
+        res.set_content(errorJson(std::string("Internal error: ")+e.what(), "server_error").dump(), "application/json");
     }
 }
 
diff --git a/tests/llamaProviderTests.cpp b/tests/llamaProviderTests.cpp
index 03541e1..b535c68 100644
--- a/tests/llamaProviderTests.cpp
+++ b/tests/llamaProviderTests.cpp
@@ -6,14 +6,17 @@
 #include "arbiterAI/telemetryCollector.h"
 #include "arbiterAI/modelManager.h"
 
+#include <nlohmann/json.hpp>
 #include <filesystem>
 #include <string>
 
 namespace arbiterAI
 {
 
-static const std::string MODEL_NAME="qwen2.5-7b-instruct";
-static const std::string SMALL_MODEL_NAME="qwen2.5-1.5b-instruct";
+static const std::string MODEL_NAME="Qwen2.5-7B-Instruct";
+static const std::string SMALL_MODEL_NAME="Qwen2.5-1.5B-Instruct";
+static const std::string MODEL_FILE="Qwen2.5-7B-Instruct-Q4_K_M.gguf";
+static const std::string SMALL_MODEL_FILE="Qwen2.5-1.5B-Instruct-Q4_K_M.gguf";
 
 class LlamaProviderTest : public ::testing::Test {
 protected:
@@ -37,6 +40,13 @@ class LlamaProviderTest : public ::testing::Test {
         {
             GTEST_SKIP() << "Model '" << MODEL_NAME << "' has no variants configured";
         }
+
+        // Check that the GGUF file actually exists on disk
+        std::string filePath="/models/"+MODEL_FILE;
+        if(!std::filesystem::exists(filePath))
+        {
+            GTEST_SKIP() << "Model file not found at " << filePath;
+        }
     }
 
     void TearDown() override
@@ -210,6 +220,13 @@ TEST_F(LlamaProviderTest, ModelSwitching)
         GTEST_SKIP() << "Model '" << SMALL_MODEL_NAME << "' not found in config or has no variants";
     }
 
+    // Verify the small model file exists
+    std::string smallFilePath="/models/"+SMALL_MODEL_FILE;
+    if(!std::filesystem::exists(smallFilePath))
+    {
+        GTEST_SKIP() << "Small model file not found at " << smallFilePath;
+    }
+
     // Create a client with the 7B model
     ChatConfig config;
     config.model=MODEL_NAME;
@@ -256,4 +273,162 @@ TEST_F(LlamaProviderTest, ModelSwitching)
     EXPECT_NE(response1.model, response2.model);
 }
 
+// ─── Config injection tests ──────────────────────────────────────────────
+
+static const std::string INJECTED_MODEL_NAME="injected-qwen-test";
+
+class LlamaConfigInjectionTest : public ::testing::Test {  // Fixture for tests that register a llama model from in-memory JSON.
+protected:
+    void SetUp() override  // Resets runtime and telemetry, initializes from tests/config, skips when the GGUF file is absent.
+    {
+        ModelRuntime::reset();
+        TelemetryCollector::reset();
+
+        ArbiterAI &ai=ArbiterAI::instance();
+        ai.initialize({"tests/config"});
+
+        // Skip unless the small model file exists under /models (it is used because it is faster to load)
+        std::string filePath="/models/"+SMALL_MODEL_FILE;
+        if(!std::filesystem::exists(filePath))
+        {
+            GTEST_SKIP() << "Small model file not found at " << filePath;
+        }
+    }
+
+    void TearDown() override  // Unloads the injected model, then resets runtime and telemetry.
+    {
+        ModelRuntime::instance().unloadModel(INJECTED_MODEL_NAME);  // NOTE(review): nothing here removes the ModelManager registration — confirm initialize() clears it between tests
+        ModelRuntime::reset();
+        TelemetryCollector::reset();
+    }
+
+    nlohmann::json buildInjectedModelJson() const  // Builds the JSON config for a single-variant (Q4_K_M) llama model named INJECTED_MODEL_NAME.
+    {
+        return nlohmann::json{
+            {"model", INJECTED_MODEL_NAME},
+            {"provider", "llama"},
+            {"ranking", 1},
+            {"version", "1.1.0"},
+            {"context_window", 4096},
+            {"max_tokens", 2048},
+            {"max_output_tokens", 1024},
+            {"hardware_requirements", {
+                {"min_system_ram_mb", 2048},
+                {"parameter_count", "1.5B"}
+            }},
+            {"context_scaling", {
+                {"base_context", 4096},
+                {"max_context", 32768},
+                {"vram_per_1k_context_mb", 32}
+            }},
+            {"variants", nlohmann::json::array({
+                {
+                    {"quantization", "Q4_K_M"},
+                    {"file_size_mb", 941},
+                    {"min_vram_mb", 1536},
+                    {"recommended_vram_mb", 2048},
+                    {"download", {
+                        {"url", ""},  // empty URL and hash: presumably no download is intended because the file must already be on disk — TODO confirm
+                        {"sha256", ""},
+                        {"filename", SMALL_MODEL_FILE}  // same file whose presence SetUp checks before running
+                    }}
+                }
+            })}
+        };
+    }
+};
+
+TEST_F(LlamaConfigInjectionTest, InjectAndLoad)  // Registers the injected config, loads it via ModelRuntime, and checks the reported state.
+{
+    nlohmann::json modelJson=buildInjectedModelJson();
+
+    std::string error;
+    bool added=ModelManager::instance().addModelFromJson(modelJson, error);
+    ASSERT_TRUE(added) << "addModelFromJson failed: " << error;
+
+    // The registered entry must echo the provider, single variant and filename from the JSON
+    std::optional<ModelInfo> info=ModelManager::instance().getModelInfo(INJECTED_MODEL_NAME);
+    ASSERT_TRUE(info.has_value());
+    EXPECT_EQ(info->provider, "llama");
+    EXPECT_EQ(info->variants.size(), 1u);
+    EXPECT_EQ(info->variants[0].quantization, "Q4_K_M");
+    EXPECT_EQ(info->variants[0].download.filename, SMALL_MODEL_FILE);
+
+    // Load the injected model via ModelRuntime, asking for variant Q4_K_M (third argument 4096 is presumably the context size — TODO confirm)
+    ErrorCode loadResult=ModelRuntime::instance().loadModel(INJECTED_MODEL_NAME, "Q4_K_M", 4096);
+    EXPECT_EQ(loadResult, ErrorCode::Success);
+
+    // The runtime must report state Loaded, the requested variant, and non-null llama model/context pointers
+    std::optional<LoadedModel> state=ModelRuntime::instance().getModelState(INJECTED_MODEL_NAME);
+    ASSERT_TRUE(state.has_value());
+    EXPECT_EQ(state->state, ModelState::Loaded);
+    EXPECT_EQ(state->variant, "Q4_K_M");
+    EXPECT_NE(state->llamaModel, nullptr);
+    EXPECT_NE(state->llamaCtx, nullptr);
+}
+
+TEST_F(LlamaConfigInjectionTest, InjectAndRunCompletion)  // Registers the injected config and runs one completion through a ChatClient.
+{
+    nlohmann::json modelJson=buildInjectedModelJson();
+
+    std::string error;
+    bool added=ModelManager::instance().addModelFromJson(modelJson, error);
+    ASSERT_TRUE(added) << "addModelFromJson failed: " << error;
+
+    // Create a ChatClient for the injected model with maxTokens set to 32
+    ChatConfig config;
+    config.model=INJECTED_MODEL_NAME;
+    config.maxTokens=32;
+
+    std::shared_ptr<ChatClient> client=ArbiterAI::instance().createChatClient(config);
+    ASSERT_NE(client, nullptr) << "Failed to create ChatClient for injected model";
+    EXPECT_EQ(client->getModel(), INJECTED_MODEL_NAME);
+
+    CompletionRequest request;  // single user message, max_tokens 32
+    request.model=INJECTED_MODEL_NAME;
+    request.max_tokens=32;
+    request.messages={{"user", "What is 1+1? Answer with just the number."}};
+
+    CompletionResponse response;
+    ErrorCode result=client->completion(request, response);
+
+    EXPECT_EQ(result, ErrorCode::Success);  // only success, non-empty text, attribution and token usage are checked — not the answer itself
+    EXPECT_FALSE(response.text.empty());
+    EXPECT_EQ(response.provider, "llama");
+    EXPECT_EQ(response.model, INJECTED_MODEL_NAME);
+    EXPECT_GT(response.usage.total_tokens, 0);
+}
+
+TEST_F(LlamaConfigInjectionTest, InjectDuplicateFails)  // Injecting the same model name twice must fail the second time.
+{
+    nlohmann::json modelJson=buildInjectedModelJson();
+
+    std::string error;
+    bool added=ModelManager::instance().addModelFromJson(modelJson, error);
+    ASSERT_TRUE(added) << "First injection failed: " << error;  // NOTE(review): other tests in this fixture inject the same name; this passes only if the registration does not survive between tests — confirm
+
+    // A second injection under the same name must be rejected, and the error text must contain "already exists"
+    bool addedAgain=ModelManager::instance().addModelFromJson(modelJson, error);
+    EXPECT_FALSE(addedAgain);
+    EXPECT_NE(error.find("already exists"), std::string::npos);
+}
+
+TEST_F(LlamaConfigInjectionTest, InjectWithoutVariantsFails)  // A llama config without variants registers but cannot be loaded.
+{
+    nlohmann::json modelJson={
+        {"model", "no-variants-llama"},
+        {"provider", "llama"},
+        {"ranking", 1},
+        {"version", "1.1.0"}
+    };
+
+    std::string error;
+    bool added=ModelManager::instance().addModelFromJson(modelJson, error);
+    ASSERT_TRUE(added) << "addModelFromJson failed: " << error;  // NOTE(review): TearDown only unloads INJECTED_MODEL_NAME, so "no-variants-llama" is not cleaned up there — confirm it cannot leak into later tests
+
+    // Registration is expected to succeed; the failure must surface at load time as InvalidRequest
+    ErrorCode loadResult=ModelRuntime::instance().loadModel("no-variants-llama");
+    EXPECT_EQ(loadResult, ErrorCode::InvalidRequest);
+}
+
 } // namespace arbiterAI