From a66b2b0d2e6d65015e5204ea2eda91dc69cf4c95 Mon Sep 17 00:00:00 2001
From: vaiju1981 <vaiju1981@gmail.com>
Date: Tue, 30 Jun 2026 16:50:05 -0700
Subject: [PATCH] Add langchain4j-jllama module with in-process LangChain4j
 adapters

Introduce a separate Maven artifact that adapts a java-llama.cpp LlamaModel
to LangChain4j's model interfaces over JNI, with no HTTP hop:

- JllamaChatModel          -> ChatModel
- JllamaStreamingChatModel -> StreamingChatModel (token streaming)
- JllamaEmbeddingModel     -> EmbeddingModel
- JllamaScoringModel       -> ScoringModel (rerank; scores aligned by input index)

The adapters borrow a caller-owned LlamaModel and never close it. The module
depends on langchain4j-core 1.17.1, but the core net.ladenthin:llama binding
gains no langchain4j dependency, so plain users never pull it transitively.

It is kept as a sibling module (not part of the root reactor) so the native
build and release pipeline stay untouched, and it targets Java 17 to match the
langchain4j 1.x baseline.

The pure message/parameter/response transforms are unit-tested model-free; an
end-to-end chat and streaming test self-skips when no GGUF is provided. The
module README documents usage and the currently unmapped surfaces (tool
calling, multimodal user input).
---
 REUSE.toml                                    |   1 +
 langchain4j-jllama/README.md                  | 110 ++++++++++
 langchain4j-jllama/pom.xml                    |  94 +++++++++
 .../llama/langchain4j/JllamaChatModel.java    |  44 ++++
 .../langchain4j/JllamaEmbeddingModel.java     |  44 ++++
 .../llama/langchain4j/JllamaScoringModel.java |  49 +++++
 .../langchain4j/JllamaStreamingChatModel.java |  59 ++++++
 .../llama/langchain4j/LangChain4jMapping.java | 188 ++++++++++++++++++
 .../llama/langchain4j/package-info.java       |  21 ++
 .../JllamaChatModelIntegrationTest.java       |  92 +++++++++
 .../langchain4j/LangChain4jMappingTest.java   | 135 +++++++++++++
 11 files changed, 837 insertions(+)
 create mode 100644 langchain4j-jllama/README.md
 create mode 100644 langchain4j-jllama/pom.xml
 create mode 100644 langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaChatModel.java
 create mode 100644 langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaEmbeddingModel.java
 create mode 100644 langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaScoringModel.java
 create mode 100644 langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaStreamingChatModel.java
 create mode 100644 langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/LangChain4jMapping.java
 create mode 100644 langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/package-info.java
 create mode 100644 langchain4j-jllama/src/test/java/net/ladenthin/llama/langchain4j/JllamaChatModelIntegrationTest.java
 create mode 100644 langchain4j-jllama/src/test/java/net/ladenthin/llama/langchain4j/LangChain4jMappingTest.java

diff --git a/REUSE.toml b/REUSE.toml
index 7fc518c1..54d18d08 100644
--- a/REUSE.toml
+++ b/REUSE.toml
@@ -24,6 +24,7 @@ path = [
     ".github/ISSUE_TEMPLATE/bug_report.md",
     ".github/ISSUE_TEMPLATE/feature_request.md",
     ".claude/commands/find-cpp-duplication.md",
+    "langchain4j-jllama/README.md",
 ]
 SPDX-FileCopyrightText = [
     "2023-2025 Konstantin Herud",
diff --git a/langchain4j-jllama/README.md b/langchain4j-jllama/README.md
new file mode 100644
index 00000000..cb9d99bd
--- /dev/null
+++ b/langchain4j-jllama/README.md
@@ -0,0 +1,110 @@
+# langchain4j-jllama
+
+[LangChain4j](https://github.com/langchain4j/langchain4j) adapters backed by an **in-process**
+[java-llama.cpp](https://github.com/bernardladenthin/java-llama.cpp) model over JNI — no HTTP server,
+no separate process.
+
+This is a **separate Maven artifact** on purpose: it depends on `langchain4j-core`, but the core
+`net.ladenthin:llama` binding does **not** depend on langchain4j, so plain java-llama.cpp users never
+pull langchain4j (or its Java 17 floor) transitively.
+
+> **Already have an OpenAI-compatible setup?** java-llama.cpp also ships
+> `net.ladenthin.llama.server.OpenAiCompatServer`, so you can point langchain4j's `langchain4j-open-ai`
+> client at a running server with zero code from this module. Use *this* module when you want the
+> in-process path (no HTTP hop, single process — e.g. desktop/Android/embedded).
+
+## Adapters
+
+| Class | langchain4j interface | java-llama.cpp call |
+|-------|-----------------------|---------------------|
+| `JllamaChatModel` | `ChatModel` | `LlamaModel.chat(...)` |
+| `JllamaStreamingChatModel` | `StreamingChatModel` | `LlamaModel.generateChat(...)` (token streaming) |
+| `JllamaEmbeddingModel` | `EmbeddingModel` | `LlamaModel.embed(...)` |
+| `JllamaScoringModel` | `ScoringModel` (re-ranking) | `LlamaModel.handleRerank(...)` |
+
+## Lifecycle: the model is *borrowed*
+
+Every adapter takes a `LlamaModel` that you have already loaded and **keep owning**. The adapter
+never loads or closes the native model — you manage it (try-with-resources or explicit `close()`).
+One `LlamaModel` can back several adapters at once.
+
+```java
+try (LlamaModel llama = new LlamaModel(new ModelParameters().setModel("models/qwen3-0.6b.gguf"))) {
+    ChatModel chat = new JllamaChatModel(llama);
+
+    String reply = chat.chat("Write a haiku about lazy senior devs.");
+    System.out.println(reply);
+}
+```
+
+Streaming:
+
+```java
+StreamingChatModel chat = new JllamaStreamingChatModel(llama);
+chat.chat("Tell me a story.", new StreamingChatResponseHandler() {
+    @Override public void onPartialResponse(String token) { System.out.print(token); }
+    @Override public void onCompleteResponse(ChatResponse response) { /* done */ }
+    @Override public void onError(Throwable error) { error.printStackTrace(); }
+});
+```
+
+Embeddings (model loaded with `enableEmbedding()`) and re-ranking
+(`enableReranking()`) plug straight into langchain4j RAG:
+
+```java
+EmbeddingModel embeddings = new JllamaEmbeddingModel(embeddingLlama);
+ScoringModel reranker     = new JllamaScoringModel(rerankLlama);
+```
+
+## Dependency
+
+```xml
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>langchain4j-jllama</artifactId>
+    <version>5.0.4-SNAPSHOT</version>
+</dependency>
+```
+
+`langchain4j-core` is pulled transitively. You still supply a java-llama.cpp native library for your
+platform the usual way (bundled in the `net.ladenthin:llama` JAR or on `java.library.path`).
+
+## Building
+
+This is a **sibling module**, not part of the root reactor. Install the core artifact first, then
+build here:
+
+```bash
+# from the repo root: publish the core net.ladenthin:llama jar to your local ~/.m2
+mvn -DskipTests install
+
+# then build/test this module
+cd langchain4j-jllama
+mvn test
+```
+
+The end-to-end test (`JllamaChatModelIntegrationTest`) self-skips unless you pass a model:
+
+```bash
+mvn test -Dnet.ladenthin.llama.model.path=/abs/path/to/model.gguf
+```
+
+## Not mapped yet
+
+- **Tool calling.** `ChatRequest.toolSpecifications()` are not forwarded, so the chat adapters return
+  assistant *text*, not `AiMessage.toolExecutionRequests()`. (java-llama.cpp itself supports tool
+  calling via `LlamaModel.chatWithTools` / typed `ToolDefinition`; bridging that to langchain4j
+  `ToolSpecification` is the planned next step.)
+- **Multimodal user input.** A multi-content `UserMessage` is flattened to its text parts; image/audio
+  content is dropped.
+- **Per-token tool-call / thinking stream events.** Streaming forwards plain text via
+  `onPartialResponse`.
+- **`response_format` (JSON mode).** `ChatRequest.responseFormat()` (json_object / json_schema) is not
+  forwarded; `modelName()` is ignored since one model is bound per adapter.
+
+Mapped request parameters: `temperature`, `topP`, `topK`, `maxOutputTokens`, `frequencyPenalty`,
+`presencePenalty`, `stopSequences`. The non-streaming chat response carries the model's real finish
+reason (`stop`/`length`/`tool_calls`) and token usage; the streaming completion carries only the
+assembled text, always with finish reason `STOP` and no token usage.
+
+Requires Java 17+ (langchain4j 1.x baseline). Targets `langchain4j-core` 1.17.1.
diff --git a/langchain4j-jllama/pom.xml b/langchain4j-jllama/pom.xml
new file mode 100644
index 00000000..58f6d365
--- /dev/null
+++ b/langchain4j-jllama/pom.xml
@@ -0,0 +1,94 @@
+<!--
+SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+
+SPDX-License-Identifier: MIT
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+
+	<groupId>net.ladenthin</groupId>
+	<artifactId>langchain4j-jllama</artifactId>
+	<version>5.0.4-SNAPSHOT</version>
+	<packaging>jar</packaging>
+
+	<name>${project.groupId}:${project.artifactId}</name>
+	<description>LangChain4j integration for java-llama.cpp: in-process ChatModel,
+		StreamingChatModel, EmbeddingModel and ScoringModel adapters backed by a
+		llama.cpp model over JNI (no HTTP hop).</description>
+	<url>https://github.com/bernardladenthin/java-llama.cpp</url>
+
+	<licenses>
+		<license>
+			<name>MIT License</name>
+			<url>https://www.opensource.org/licenses/mit-license.php</url>
+			<distribution>repo</distribution>
+		</license>
+	</licenses>
+
+	<developers>
+		<developer>
+			<name>Bernard Ladenthin</name>
+			<organizationUrl>https://github.com/bernardladenthin</organizationUrl>
+		</developer>
+	</developers>
+
+	<scm>
+		<connection>scm:git:https://github.com/bernardladenthin/java-llama.cpp.git</connection>
+		<developerConnection>scm:git:https://github.com/bernardladenthin/java-llama.cpp.git</developerConnection>
+		<url>https://github.com/bernardladenthin/java-llama.cpp/tree/main</url>
+	</scm>
+
+	<properties>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+		<maven.compiler.release>17</maven.compiler.release>
+		<!-- Keep in lockstep with the core java-llama.cpp artifact version. -->
+		<jllama.version>5.0.4-SNAPSHOT</jllama.version>
+		<langchain4j.version>1.17.1</langchain4j.version>
+		<junit.version>6.1.1</junit.version>
+		<hamcrest.version>3.0</hamcrest.version>
+		<surefire.version>3.5.5</surefire.version>
+	</properties>
+
+	<dependencies>
+		<!-- The JNI binding we adapt. Provided-by-the-consumer in spirit, but compile
+		     scope so a consumer that only declares langchain4j-jllama still gets it. -->
+		<dependency>
+			<groupId>net.ladenthin</groupId>
+			<artifactId>llama</artifactId>
+			<version>${jllama.version}</version>
+		</dependency>
+
+		<!-- The interfaces we implement (ChatModel/StreamingChatModel/EmbeddingModel/ScoringModel). -->
+		<dependency>
+			<groupId>dev.langchain4j</groupId>
+			<artifactId>langchain4j-core</artifactId>
+			<version>${langchain4j.version}</version>
+		</dependency>
+
+		<dependency>
+			<groupId>org.junit.jupiter</groupId>
+			<artifactId>junit-jupiter</artifactId>
+			<version>${junit.version}</version>
+			<scope>test</scope>
+		</dependency>
+		<dependency>
+			<groupId>org.hamcrest</groupId>
+			<artifactId>hamcrest</artifactId>
+			<version>${hamcrest.version}</version>
+			<scope>test</scope>
+		</dependency>
+	</dependencies>
+
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-surefire-plugin</artifactId>
+				<version>${surefire.version}</version>
+			</plugin>
+		</plugins>
+	</build>
+</project>
diff --git a/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaChatModel.java b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaChatModel.java
new file mode 100644
index 00000000..dcade59f
--- /dev/null
+++ b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaChatModel.java
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import dev.langchain4j.model.chat.ChatModel;
+import dev.langchain4j.model.chat.request.ChatRequest;
+import dev.langchain4j.model.chat.response.ChatResponse;
+import java.util.Objects;
+import net.ladenthin.llama.LlamaModel;
+
+/**
+ * langchain4j {@link ChatModel} backed by an in-process java-llama.cpp model (over JNI, no HTTP).
+ *
+ * <p>The model is <em>borrowed</em>: this adapter never loads or closes it. Construct it from a
+ * {@link LlamaModel} you already own and keep managing that model's lifecycle (try-with-resources or
+ * an explicit {@code close()}). One {@code LlamaModel} can back several adapters at once.
+ *
+ * <p>Mapped today: messages (system/user/assistant/tool-result) and the request parameters
+ * {@code temperature}, {@code topP}, {@code topK}, {@code maxOutputTokens}, {@code frequencyPenalty},
+ * {@code presencePenalty} and {@code stopSequences}. Tool <em>specifications</em> on the request are
+ * not yet forwarded, so this returns assistant text, not tool calls — a bridge is planned (see README).
+ */
+public final class JllamaChatModel implements ChatModel {
+
+    private final LlamaModel model;
+
+    /**
+     * Creates a chat model over a borrowed {@link LlamaModel}.
+     *
+     * @param model the loaded model to drive; not closed by this adapter
+     */
+    public JllamaChatModel(LlamaModel model) {
+        this.model = Objects.requireNonNull(model, "model");
+    }
+
+    @Override
+    public ChatResponse doChat(ChatRequest chatRequest) {
+        net.ladenthin.llama.value.ChatResponse response =
+                model.chat(LangChain4jMapping.toJllamaRequest(chatRequest));
+        return LangChain4jMapping.toLangChainResponse(response);
+    }
+}
diff --git a/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaEmbeddingModel.java b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaEmbeddingModel.java
new file mode 100644
index 00000000..9a4b965f
--- /dev/null
+++ b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaEmbeddingModel.java
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.embedding.EmbeddingModel;
+import dev.langchain4j.model.output.Response;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import net.ladenthin.llama.LlamaModel;
+
+/**
+ * langchain4j {@link EmbeddingModel} backed by an in-process java-llama.cpp model.
+ *
+ * <p>Segments are embedded one at a time, in input order. The backing {@link LlamaModel} must be
+ * loaded in embedding mode ({@code ModelParameters.enableEmbedding()}); it is <em>borrowed</em>
+ * (never closed here) — see {@link JllamaChatModel}.
+ */
+public final class JllamaEmbeddingModel implements EmbeddingModel {
+
+    private final LlamaModel model;
+
+    /**
+     * Creates an embedding model over a borrowed {@link LlamaModel}.
+     *
+     * @param model the loaded embedding-mode model to drive; not closed by this adapter
+     */
+    public JllamaEmbeddingModel(LlamaModel model) {
+        this.model = Objects.requireNonNull(model, "model");
+    }
+
+    @Override
+    public Response<List<Embedding>> embedAll(List<TextSegment> textSegments) {
+        List<Embedding> embeddings = new ArrayList<>(textSegments.size());
+        for (TextSegment segment : textSegments) {
+            embeddings.add(Embedding.from(model.embed(segment.text())));
+        }
+        return Response.from(embeddings);
+    }
+}
diff --git a/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaScoringModel.java b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaScoringModel.java
new file mode 100644
index 00000000..37473c28
--- /dev/null
+++ b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaScoringModel.java
@@ -0,0 +1,49 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.output.Response;
+import dev.langchain4j.model.scoring.ScoringModel;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import net.ladenthin.llama.LlamaModel;
+
+/**
+ * langchain4j {@link ScoringModel} (re-ranker) backed by an in-process java-llama.cpp model.
+ *
+ * <p>Delegates to {@code LlamaModel.handleRerank}, so the backing {@link LlamaModel} must be loaded
+ * in reranking mode ({@code ModelParameters.enableReranking()}). Scores come back in input-segment
+ * order; a segment missing from the native response scores {@code 0.0}. The model is
+ * <em>borrowed</em> (never closed here) — see {@link JllamaChatModel}.
+ */
+public final class JllamaScoringModel implements ScoringModel {
+
+    private final LlamaModel model;
+
+    /**
+     * Creates a scoring model over a borrowed {@link LlamaModel}.
+     *
+     * @param model the loaded reranking-mode model to drive; not closed by this adapter
+     */
+    public JllamaScoringModel(LlamaModel model) {
+        this.model = Objects.requireNonNull(model, "model");
+    }
+
+    @Override
+    public Response<List<Double>> scoreAll(List<TextSegment> segments, String query) {
+        String[] documents = new String[segments.size()];
+        for (int i = 0; i < segments.size(); i++) {
+            documents[i] = segments.get(i).text();
+        }
+        double[] scores = LangChain4jMapping.parseRerankScores(model.handleRerank(query, documents), documents.length);
+        List<Double> result = new ArrayList<>(scores.length);
+        for (double score : scores) {
+            result.add(score);
+        }
+        return Response.from(result);
+    }
+}
diff --git a/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaStreamingChatModel.java b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaStreamingChatModel.java
new file mode 100644
index 00000000..9bf2124a
--- /dev/null
+++ b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/JllamaStreamingChatModel.java
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import dev.langchain4j.data.message.AiMessage;
+import dev.langchain4j.model.chat.StreamingChatModel;
+import dev.langchain4j.model.chat.request.ChatRequest;
+import dev.langchain4j.model.chat.response.ChatResponse;
+import dev.langchain4j.model.chat.response.StreamingChatResponseHandler;
+import dev.langchain4j.model.output.FinishReason;
+import java.util.Objects;
+import net.ladenthin.llama.LlamaIterable;
+import net.ladenthin.llama.LlamaModel;
+import net.ladenthin.llama.value.LlamaOutput;
+
+/**
+ * langchain4j {@link StreamingChatModel} backed by an in-process java-llama.cpp model.
+ *
+ * <p>Each generated token is forwarded to {@link StreamingChatResponseHandler#onPartialResponse}; a
+ * final {@code onCompleteResponse} carries the assembled text, always with {@code FinishReason.STOP}
+ * and no token usage. An exception thrown while streaming is routed to {@code onError} instead.
+ *
+ * <p>The model is <em>borrowed</em> (never closed here) — see {@link JllamaChatModel}. Tool
+ * specifications are not yet forwarded; this streams plain assistant text.
+ */
+public final class JllamaStreamingChatModel implements StreamingChatModel {
+
+    private final LlamaModel model;
+
+    /**
+     * Creates a streaming chat model over a borrowed {@link LlamaModel}.
+     *
+     * @param model the loaded model to drive; not closed by this adapter
+     */
+    public JllamaStreamingChatModel(LlamaModel model) {
+        this.model = Objects.requireNonNull(model, "model");
+    }
+
+    @Override
+    public void doChat(ChatRequest chatRequest, StreamingChatResponseHandler handler) {
+        StringBuilder full = new StringBuilder();
+        try (LlamaIterable stream = model.generateChat(LangChain4jMapping.toStreamingParameters(chatRequest))) {
+            for (LlamaOutput output : stream) {
+                full.append(output.text);
+                handler.onPartialResponse(output.text);
+            }
+        } catch (Exception e) {
+            handler.onError(e);
+            return;
+        }
+        handler.onCompleteResponse(
+                ChatResponse.builder()
+                        .aiMessage(AiMessage.from(full.toString()))
+                        .finishReason(FinishReason.STOP)
+                        .build());
+    }
+}
diff --git a/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/LangChain4jMapping.java b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/LangChain4jMapping.java
new file mode 100644
index 00000000..da0ca32b
--- /dev/null
+++ b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/LangChain4jMapping.java
@@ -0,0 +1,188 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import dev.langchain4j.data.message.AiMessage;
+import dev.langchain4j.data.message.ChatMessage;
+import dev.langchain4j.data.message.Content;
+import dev.langchain4j.data.message.ContentType;
+import dev.langchain4j.data.message.SystemMessage;
+import dev.langchain4j.data.message.TextContent;
+import dev.langchain4j.data.message.ToolExecutionResultMessage;
+import dev.langchain4j.data.message.UserMessage;
+import dev.langchain4j.model.chat.request.ChatRequest;
+import dev.langchain4j.model.chat.response.ChatResponse;
+import dev.langchain4j.model.output.FinishReason;
+import dev.langchain4j.model.output.TokenUsage;
+import java.io.IOException;
+import java.util.List;
+import net.ladenthin.llama.json.RerankResponseParser;
+import net.ladenthin.llama.parameters.InferenceParameters;
+
+/**
+ * Pure (model-free) translation between langchain4j chat types and java-llama.cpp parameters.
+ *
+ * <p>Every method here is a deterministic data transform with no JNI and no loaded model, so the
+ * mapping is unit-testable on its own (see {@code LangChain4jMappingTest}). The adapters keep the
+ * live-model calls; this class only reshapes their inputs and outputs.
+ */
+final class LangChain4jMapping {
+
+    private LangChain4jMapping() {}
+
+    /**
+     * Build a java-llama.cpp typed chat request from a langchain4j chat request. Messages map by
+     * role; the request's temperature, topP, topK, maxOutputTokens, frequencyPenalty, presencePenalty
+     * and stopSequences (each only when set) ride along as an inference customizer.
+     */
+    static net.ladenthin.llama.parameters.ChatRequest toJllamaRequest(ChatRequest request) {
+        net.ladenthin.llama.parameters.ChatRequest jllama =
+                net.ladenthin.llama.parameters.ChatRequest.empty();
+        for (ChatMessage message : request.messages()) {
+            jllama = jllama.appendMessage(toJllamaMessage(message));
+        }
+        return jllama.withInferenceCustomizer(params -> applySampling(params, request));
+    }
+
+    /**
+     * Build the streaming inference parameters (messages JSON + sampling) for {@code generateChat}.
+     * Shares {@link #toJllamaRequest(ChatRequest)} so blocking and streaming stay in lockstep.
+     */
+    static InferenceParameters toStreamingParameters(ChatRequest request) {
+        net.ladenthin.llama.parameters.ChatRequest jllama = toJllamaRequest(request);
+        InferenceParameters params =
+                InferenceParameters.empty().withMessagesJson(jllama.buildMessagesJson());
+        return jllama.applyCustomizer(params);
+    }
+
+    /** Wrap a java-llama.cpp chat result as a langchain4j {@link ChatResponse}. */
+    static ChatResponse toLangChainResponse(net.ladenthin.llama.value.ChatResponse response) {
+        ChatResponse.Builder builder =
+                ChatResponse.builder().aiMessage(AiMessage.from(response.getFirstContent()));
+        net.ladenthin.llama.value.Usage usage = response.getUsage();
+        if (usage != null) {
+            builder.tokenUsage(
+                    new TokenUsage((int) usage.getPromptTokens(), (int) usage.getCompletionTokens()));
+        }
+        List<net.ladenthin.llama.value.ChatChoice> choices = response.getChoices();
+        String finishReason = choices.isEmpty() ? null : choices.get(0).getFinishReason();
+        return builder.finishReason(toFinishReason(finishReason)).build();
+    }
+
+    /**
+     * Map java-llama.cpp's OpenAI-style finish-reason string to the langchain4j enum. A {@code null}
+     * (no choices / reason absent) is treated as a normal {@code STOP}; an unrecognized value maps to
+     * {@code OTHER} rather than guessing.
+     */
+    static FinishReason toFinishReason(String reason) {
+        if (reason == null) {
+            return FinishReason.STOP;
+        }
+        switch (reason) {
+            case "stop":
+                return FinishReason.STOP;
+            case "length":
+                return FinishReason.LENGTH;
+            case "tool_calls":
+                return FinishReason.TOOL_EXECUTION;
+            case "content_filter":
+                return FinishReason.CONTENT_FILTER;
+            default:
+                return FinishReason.OTHER;
+        }
+    }
+
+    /**
+     * Align native rerank scores to input order: each {@code {document, index, score}} entry of the
+     * native JSON array is stored at its {@code index} (array order if absent; out-of-range skipped).
+     *
+     * @param json the raw native rerank JSON; a payload that is not an array yields all zeros
+     * @param count the number of input documents (output length)
+     * @return scores indexed by input position; positions absent from the response stay {@code 0.0}
+     * @throws IllegalStateException if {@code json} cannot be parsed
+     */
+    static double[] parseRerankScores(String json, int count) {
+        double[] scores = new double[count];
+        try {
+            JsonNode array = RerankResponseParser.OBJECT_MAPPER.readTree(json);
+            if (array.isArray()) {
+                int position = 0;
+                for (JsonNode entry : array) {
+                    // "index" is the input position; fall back to array order when the field is
+                    // absent so a response without it never silently yields all-zero scores.
+                    int index = entry.path("index").asInt(position);
+                    if (index >= 0 && index < count) {
+                        scores[index] = entry.path("score").asDouble(0.0);
+                    }
+                    position++;
+                }
+            }
+        } catch (IOException e) {
+            throw new IllegalStateException("Failed to parse rerank response", e);
+        }
+        return scores;
+    }
+
+    private static net.ladenthin.llama.value.ChatMessage toJllamaMessage(ChatMessage message) {
+        switch (message.type()) {
+            case SYSTEM:
+                return new net.ladenthin.llama.value.ChatMessage(
+                        "system", ((SystemMessage) message).text());
+            case USER:
+                return new net.ladenthin.llama.value.ChatMessage("user", userText((UserMessage) message));
+            case AI:
+                String aiText = ((AiMessage) message).text();
+                return new net.ladenthin.llama.value.ChatMessage(
+                        "assistant", aiText == null ? "" : aiText);
+            case TOOL_EXECUTION_RESULT:
+                ToolExecutionResultMessage tool = (ToolExecutionResultMessage) message;
+                return net.ladenthin.llama.value.ChatMessage.toolResult(tool.id(), tool.text());
+            default:
+                // CUSTOM and any future type: no faithful chat-role mapping exists.
+                throw new IllegalArgumentException("Unsupported message type: " + message.type());
+        }
+    }
+
+    /** Flatten a user message to text: text parts are concatenated as-is, non-text parts dropped. */
+    private static String userText(UserMessage message) {
+        if (message.hasSingleText()) {
+            return message.singleText();
+        }
+        StringBuilder text = new StringBuilder();
+        for (Content content : message.contents()) {
+            if (content.type() == ContentType.TEXT) {
+                text.append(((TextContent) content).text());
+            }
+        }
+        return text.toString();
+    }
+
+    private static InferenceParameters applySampling(InferenceParameters params, ChatRequest request) {
+        if (request.temperature() != null) {
+            params = params.withTemperature(request.temperature().floatValue());
+        }
+        if (request.topP() != null) {
+            params = params.withTopP(request.topP().floatValue());
+        }
+        if (request.topK() != null) {
+            params = params.withTopK(request.topK());
+        }
+        if (request.maxOutputTokens() != null) {
+            params = params.withNPredict(request.maxOutputTokens());
+        }
+        if (request.frequencyPenalty() != null) {
+            params = params.withFrequencyPenalty(request.frequencyPenalty().floatValue());
+        }
+        if (request.presencePenalty() != null) {
+            params = params.withPresencePenalty(request.presencePenalty().floatValue());
+        }
+        List<String> stops = request.stopSequences();
+        if (stops != null && !stops.isEmpty()) {
+            params = params.withStopStrings(stops.toArray(new String[0]));
+        }
+        return params;
+    }
+}
diff --git a/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/package-info.java b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/package-info.java
new file mode 100644
index 00000000..28310e04
--- /dev/null
+++ b/langchain4j-jllama/src/main/java/net/ladenthin/llama/langchain4j/package-info.java
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+/**
+ * langchain4j adapters backed by an in-process java-llama.cpp {@link net.ladenthin.llama.LlamaModel}
+ * over JNI — no HTTP server, no separate process.
+ *
+ * <ul>
+ *   <li>{@link net.ladenthin.llama.langchain4j.JllamaChatModel} — {@code ChatModel}</li>
+ *   <li>{@link net.ladenthin.llama.langchain4j.JllamaStreamingChatModel} — {@code StreamingChatModel}</li>
+ *   <li>{@link net.ladenthin.llama.langchain4j.JllamaEmbeddingModel} — {@code EmbeddingModel}</li>
+ *   <li>{@link net.ladenthin.llama.langchain4j.JllamaScoringModel} — {@code ScoringModel} (re-ranking)</li>
+ * </ul>
+ *
+ * <p>Every adapter <em>borrows</em> a model the caller has already loaded and keeps owning: the
+ * adapter never loads or closes the native model. This artifact depends on {@code langchain4j-core}
+ * but the core {@code net.ladenthin:llama} binding does not depend on langchain4j, so plain
+ * java-llama.cpp users never pull langchain4j transitively.
+ */
+package net.ladenthin.llama.langchain4j;
diff --git a/langchain4j-jllama/src/test/java/net/ladenthin/llama/langchain4j/JllamaChatModelIntegrationTest.java b/langchain4j-jllama/src/test/java/net/ladenthin/llama/langchain4j/JllamaChatModelIntegrationTest.java
new file mode 100644
index 00000000..ea4b18ec
--- /dev/null
+++ b/langchain4j-jllama/src/test/java/net/ladenthin/llama/langchain4j/JllamaChatModelIntegrationTest.java
@@ -0,0 +1,105 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.notNullValue;
+
+import dev.langchain4j.data.message.UserMessage;
+import dev.langchain4j.model.chat.request.ChatRequest;
+import dev.langchain4j.model.chat.response.ChatResponse;
+import dev.langchain4j.model.chat.response.StreamingChatResponseHandler;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
+import net.ladenthin.llama.LlamaModel;
+import net.ladenthin.llama.parameters.ModelParameters;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * End-to-end smoke test that drives the chat adapters against a real model.
+ *
+ * <p>Every test skips itself unless a GGUF file is supplied through
+ * {@code -Dnet.ladenthin.llama.model.path=/abs/path/to/model.gguf} (the native library must be on
+ * the path as well). This mirrors the core project's model-gated tests and keeps a model-free
+ * checkout green.
+ */
+class JllamaChatModelIntegrationTest {
+
+    /**
+     * Resolves the model file named by the {@code net.ladenthin.llama.model.path} property.
+     *
+     * <p>Aborts the calling test through a failed assumption when the property is unset or empty,
+     * or when the file it names does not exist.
+     *
+     * @return the path of the existing model file
+     */
+    private static Path modelPath() {
+        String configured = System.getProperty("net.ladenthin.llama.model.path");
+        boolean propertySet = configured != null && !configured.isEmpty();
+        Assumptions.assumeTrue(propertySet, "model path property not set");
+        Path candidate = Paths.get(configured);
+        Assumptions.assumeTrue(Files.exists(candidate), "model file not present: " + candidate);
+        return candidate;
+    }
+
+    /** Builds the single-turn prompt, capped at eight output tokens, that both tests send. */
+    private static ChatRequest singleWordRequest() {
+        return ChatRequest.builder()
+                .messages(UserMessage.from("Reply with the single word: ok"))
+                .maxOutputTokens(8)
+                .build();
+    }
+
+    /** A blocking chat call must produce an assistant message whose text is not null. */
+    @Test
+    void chatReturnsAssistantText() {
+        String modelFile = modelPath().toString();
+        try (LlamaModel llama = new LlamaModel(new ModelParameters().setModel(modelFile))) {
+            ChatResponse reply = new JllamaChatModel(llama).chat(singleWordRequest());
+
+            assertThat(reply.aiMessage(), is(notNullValue()));
+            assertThat(reply.aiMessage().text(), is(notNullValue()));
+        }
+    }
+
+    /** The completed response text must equal the concatenation of the streamed partials. */
+    @Test
+    void streamingDeliversTokensThenCompletes() throws Exception {
+        String modelFile = modelPath().toString();
+        try (LlamaModel llama = new LlamaModel(new ModelParameters().setModel(modelFile))) {
+            JllamaStreamingChatModel adapter = new JllamaStreamingChatModel(llama);
+            StringBuilder partials = new StringBuilder();
+            CompletableFuture<ChatResponse> outcome = new CompletableFuture<>();
+            StreamingChatResponseHandler handler =
+                    new StreamingChatResponseHandler() {
+                        @Override
+                        public void onPartialResponse(String partial) {
+                            partials.append(partial);
+                        }
+
+                        @Override
+                        public void onCompleteResponse(ChatResponse complete) {
+                            outcome.complete(complete);
+                        }
+
+                        @Override
+                        public void onError(Throwable error) {
+                            outcome.completeExceptionally(error);
+                        }
+                    };
+
+            adapter.chat(singleWordRequest(), handler);
+
+            // Bounded wait: a stalled generation fails the test instead of hanging the build.
+            ChatResponse finished = outcome.get(60, TimeUnit.SECONDS);
+            assertThat(finished.aiMessage().text(), is(partials.toString()));
+        }
+    }
+}
diff --git a/langchain4j-jllama/src/test/java/net/ladenthin/llama/langchain4j/LangChain4jMappingTest.java b/langchain4j-jllama/src/test/java/net/ladenthin/llama/langchain4j/LangChain4jMappingTest.java
new file mode 100644
index 00000000..745213c0
--- /dev/null
+++ b/langchain4j-jllama/src/test/java/net/ladenthin/llama/langchain4j/LangChain4jMappingTest.java
@@ -0,0 +1,147 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+
+import dev.langchain4j.data.message.AiMessage;
+import dev.langchain4j.data.message.SystemMessage;
+import dev.langchain4j.data.message.TextContent;
+import dev.langchain4j.data.message.ToolExecutionResultMessage;
+import dev.langchain4j.data.message.UserMessage;
+import dev.langchain4j.model.chat.request.ChatRequest;
+import dev.langchain4j.model.output.FinishReason;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import net.ladenthin.llama.parameters.InferenceParameters;
+import net.ladenthin.llama.value.ChatMessage;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for the pure langchain4j&lt;-&gt;java-llama.cpp transforms in
+ * {@link LangChain4jMapping}. No model is loaded by any of them.
+ */
+class LangChain4jMappingTest {
+
+    /** System, user, assistant and tool-result messages keep their order, role and text. */
+    @Test
+    void mapsEveryRoleAndContent() {
+        ChatRequest request =
+                ChatRequest.builder()
+                        .messages(
+                                SystemMessage.from("you are terse"),
+                                UserMessage.from("hi"),
+                                AiMessage.from("hello"),
+                                ToolExecutionResultMessage.from("call_1", "search", "42"))
+                        .build();
+
+        List<ChatMessage> mapped = LangChain4jMapping.toJllamaRequest(request).getMessages();
+
+        List<String> seenRoles = new ArrayList<>();
+        List<String> seenContents = new ArrayList<>();
+        mapped.forEach(
+                message -> {
+                    seenRoles.add(message.getRole());
+                    seenContents.add(message.getContent());
+                });
+        assertThat(seenRoles, contains("system", "user", "assistant", "tool"));
+        assertThat(seenContents, contains("you are terse", "hi", "hello", "42"));
+    }
+
+    /** Several text parts of one user message are concatenated into a single content string. */
+    @Test
+    void flattensMultimodalUserMessageToText() {
+        UserMessage twoTextParts =
+                UserMessage.from(TextContent.from("Hello "), TextContent.from("world"));
+        ChatRequest request = ChatRequest.builder().messages(twoTextParts).build();
+
+        ChatMessage first = LangChain4jMapping.toJllamaRequest(request).getMessages().get(0);
+
+        assertThat(first.getRole(), is("user"));
+        assertThat(first.getContent(), is("Hello world"));
+    }
+
+    /** Sampling settings and the prompt must all show up in the serialized parameters. */
+    @Test
+    void appliesSamplingParametersToInferenceJson() {
+        ChatRequest request =
+                ChatRequest.builder()
+                        .messages(UserMessage.from("hi"))
+                        .temperature(0.3)
+                        .topK(40)
+                        .maxOutputTokens(64)
+                        .frequencyPenalty(0.5)
+                        .presencePenalty(0.25)
+                        .stopSequences(Arrays.asList("STOP"))
+                        .build();
+
+        String serialized = LangChain4jMapping.toStreamingParameters(request).toString();
+
+        String[] expectedKeys = {
+            "temperature", "top_k", "n_predict", "frequency_penalty", "presence_penalty", "stop"
+        };
+        for (String key : expectedKeys) {
+            assertThat(serialized, containsString("\"" + key + "\""));
+        }
+        // Messages must survive into the streaming parameter blob too.
+        assertThat(serialized, containsString("hi"));
+    }
+
+    /** Known finish-reason strings map one-to-one; unknown ones map to OTHER, null to STOP. */
+    @Test
+    void mapsFinishReasonStrings() {
+        assertFinishReason("stop", FinishReason.STOP);
+        assertFinishReason("length", FinishReason.LENGTH);
+        assertFinishReason("tool_calls", FinishReason.TOOL_EXECUTION);
+        assertFinishReason("content_filter", FinishReason.CONTENT_FILTER);
+        assertFinishReason("something_new", FinishReason.OTHER);
+        // No choices / absent reason is the normal terminal state.
+        assertFinishReason(null, FinishReason.STOP);
+    }
+
+    /** Asserts that {@code raw} is mapped to {@code expected} by the finish-reason transform. */
+    private static void assertFinishReason(String raw, FinishReason expected) {
+        assertThat(LangChain4jMapping.toFinishReason(raw), is(expected));
+    }
+
+    /** Scores are placed by each entry's {@code index} field, not by its array position. */
+    @Test
+    void rerankScoresAlignToInputOrderNotResponseOrder() {
+        // Native results arrive out of order; "index" is the input position.
+        String outOfOrder =
+                "[{\"document\":\"b\",\"index\":1,\"score\":0.9},"
+                        + "{\"document\":\"a\",\"index\":0,\"score\":0.1}]";
+
+        double[] aligned = LangChain4jMapping.parseRerankScores(outOfOrder, 2);
+
+        // Array equality checks the length and both positions in one assertion.
+        assertThat(aligned, is(new double[] {0.1, 0.9}));
+    }
+
+    /** An empty result array still yields one score per input, each left at zero. */
+    @Test
+    void rerankScoresDefaultToZeroForMissingEntries() {
+        double[] zeros = LangChain4jMapping.parseRerankScores("[]", 3);
+
+        assertThat(zeros, is(new double[3]));
+    }
+
+    /** Without an {@code index} field the array position decides where a score lands. */
+    @Test
+    void rerankScoresFallBackToArrayOrderWhenIndexAbsent() {
+        // No "index" field: array position is used, so scores are not silently all-zero.
+        String withoutIndex =
+                "[{\"document\":\"a\",\"score\":0.7},{\"document\":\"b\",\"score\":0.2}]";
+
+        double[] positional = LangChain4jMapping.parseRerankScores(withoutIndex, 2);
+
+        assertThat(positional[0], is(0.7));
+        assertThat(positional[1], is(0.2));
+    }
+}