Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
title: Fix semaphore permit leaks in Http2SolrClient's AsyncTracker. Avoid IO-thread deadlock on connection failure retries. Add a new metric gauge solr_client_request_async_permits
type: fixed
authors:
- name: Jan Høydahl
url: https://home.apache.org/phonebook.html?uid=janhoy
links:
- name: SOLR-18174
url: https://issues.apache.org/jira/browse/SOLR-18174
Original file line number Diff line number Diff line change
Expand Up @@ -445,5 +445,19 @@ public void initializeMetrics(SolrMetricsContext parentContext, String scope) {
null,
solrMetricsContext.getMetricRegistry(),
SolrMetricManager.mkName("httpShardExecutor", expandedScope, "threadPool"));
if (defaultClient != null) {
solrMetricsContext.gauge(
defaultClient::asyncTrackerAvailablePermits,
true,
"asyncPermits.available",
Comment thread
janhoy marked this conversation as resolved.
expandedScope,
"threadPool");
solrMetricsContext.gauge(
defaultClient::asyncTrackerMaxPermits,
true,
"asyncPermits.max",
expandedScope,
"threadPool");
}
}
}

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions solr/prometheus-exporter/conf/solr-exporter-config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,20 @@
value : $value
}
</str>
<str>
.metrics["solr.node"] | to_entries | .[] |
select(.key | startswith("QUERY.httpShardHandler.threadPool.asyncPermits.")) as $object |
$object.key | split(".") | last as $state |
$object.value as $value |
{
name : "solr_client_request_async_permits",
type : "GAUGE",
help : "See following URL: https://solr.apache.org/guide/solr/latest/deployment-guide/metrics-reporting.html",
label_names : ["state"],
label_values : [$state],
value : $value
}
</str>

<!--
Query related core metrics; see jq-templates for details on the core-query template used below
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,19 @@ example.solr.jetty.org.eclipse.jetty.server.handler.DefaultHandler.2xx-responses
example.solr.jetty.org.eclipse.jetty.server.handler.DefaultHandler.2xx-responses.mean_rate 0.5698031798408144 1482932097
----

=== HTTP Client Registry

Solr exposes metrics for the internal Jetty-based HTTP client used for distributed (shard) requests.
These gauges are registered under the `QUERY` category in the node metrics registry, with Dropwizard keys
`QUERY.httpShardHandler.threadPool.asyncPermits.available` and `QUERY.httpShardHandler.threadPool.asyncPermits.max`.

[cols="2,1,3",options="header"]
|===
| Prometheus Metric Name | Type | Description
| `solr_client_request_async_permits{state="max"}` | gauge | Configured maximum number of outstanding concurrent async HTTP requests (controlled by `solr.solrj.http.jetty.async_requests.max`, default 1000).
| `solr_client_request_async_permits{state="available"}` | gauge | Number of async request permits currently available (i.e., not in use). When this approaches zero, new distributed requests will block waiting for a permit.
|===

== Core Level Metrics

These metrics are available only on a per-core basis.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ Due to changes in Lucene 9, that isn't possible any more.

== Solr 9.11

=== Max distributed requests now configurable

The internal HTTP client used for distributed shard sub-requests previously had a hard-coded limit of 1000 concurrent async requests per node.
In large clusters, a single query can fan out to hundreds of sub-requests, quickly exhausting this limit and causing requests to queue, potentially leading to stalls or timeouts.
This limit is now configurable via the system property `solr.solrj.http.jetty.async_requests.max`.

Current permit utilization can be monitored via the `solr_client_request_async_permits` metric (see xref:deployment-guide:metrics-reporting.adoc#http-client-registry[HTTP Client Registry]).

=== Docker

The `gosu` binary is no longer installed in the Solr Docker image. See https://github.com/tianon/gosu[gosu github page] for alternatives, such as `runuser`, `setpriv` or `chroot`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Phaser;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
Expand All @@ -60,6 +61,7 @@
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.EnvUtils;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.ObjectReleaseTracker;
Expand Down Expand Up @@ -108,6 +110,11 @@
*/
public class Http2SolrClient extends HttpSolrClientBase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

/** System property to cap the maximum number of outstanding async HTTP requests. Default 1000. */
public static final String ASYNC_REQUESTS_MAX_SYSPROP =
"solr.solrj.http.jetty.async_requests.max";

public static final String REQ_PRINCIPAL_KEY = "solr-req-principal";
private static final String USER_AGENT =
"Solr[" + MethodHandles.lookup().lookupClass().getName() + "] " + SolrVersion.LATEST_STRING;
Expand Down Expand Up @@ -562,7 +569,17 @@ public void onHeaders(Response response) {
@Override
public void onFailure(Response response, Throwable failure) {
super.onFailure(response, failure);
future.completeExceptionally(new SolrServerException(failure.getMessage(), failure));
// Dispatch off the IO thread to avoid blocking semaphore.acquire() on retry.
// Fall back to IO thread if executor rejects (shutdown/overloaded).
SolrServerException ex = new SolrServerException(failure.getMessage(), failure);
try {
executor.execute(() -> future.completeExceptionally(ex));
} catch (RejectedExecutionException ree) {
log.warn(
"Failed to complete future exceptionally due to executor rejection, completing on IO thread.",
ree);
future.completeExceptionally(ex);
}
}
});

Expand Down Expand Up @@ -981,7 +998,21 @@ public void close() {
}

private static class AsyncTracker {
private static final int MAX_OUTSTANDING_REQUESTS = 1000;
/**
* Read per-instance so that tests can set the sysprop before constructing a client and have it
* take effect without relying on class-load ordering across test suites in the same JVM.
*/
private final int maxOutstandingRequests;

/**
* Request attribute key used to guard idempotency across both listeners. Set immediately after
* {@code phaser.register()} — before {@code available.acquire()} — so that {@code onComplete}
* can never fire between registration and attribute-set and leave a phaser party stranded.
* Jetty can re-fire {@code onRequestQueued} for the same exchange (e.g. after a GOAWAY retry);
* the attribute makes the second call a no-op. {@code onComplete} always fires exactly once and
* uses the attribute to call {@code arriveAndDeregister()} + {@code release()} exactly once.
*/
private static final String PERMIT_ACQUIRED_ATTR = "solr.async_tracker.permit_acquired";

// wait for async requests
private final Phaser phaser;
Expand All @@ -992,35 +1023,73 @@ private static class AsyncTracker {

AsyncTracker() {
// TODO: what about shared instances?
maxOutstandingRequests = EnvUtils.getPropertyAsInteger(ASYNC_REQUESTS_MAX_SYSPROP, 1000);
phaser = new Phaser(1);
available = new Semaphore(MAX_OUTSTANDING_REQUESTS, false);
available = new Semaphore(maxOutstandingRequests, false);
Comment thread
janhoy marked this conversation as resolved.
queuedListener =
request -> {
if (request.getAttributes().get(PERMIT_ACQUIRED_ATTR) != null) {
return;
}
phaser.register();
// Set the attribute before acquire() so onComplete can never race between
// phaser.register() and attribute-set, which would strand a phaser party forever.
request.attribute(PERMIT_ACQUIRED_ATTR, Boolean.TRUE);
try {
available.acquire();
} catch (InterruptedException ignored) {

} catch (InterruptedException e) {
// completeListener will call arriveAndDeregister() when onComplete fires.
Thread.currentThread().interrupt();
}
Comment thread
janhoy marked this conversation as resolved.
};
completeListener =
result -> {
phaser.arriveAndDeregister();
available.release();
if (result != null
&& result.getRequest().getAttributes().get(PERMIT_ACQUIRED_ATTR) != null) {
phaser.arriveAndDeregister();
available.release();
}
};
}

int getMaxRequestsQueuedPerDestination() {
// comfortably above max outstanding requests
return MAX_OUTSTANDING_REQUESTS * 3;
return maxOutstandingRequests * 3;
}

int maxPermits() {
return maxOutstandingRequests;
}

int availablePermits() {
return available.availablePermits();
}

public void waitForComplete() {
phaser.arriveAndAwaitAdvance();
// Use awaitAdvanceInterruptibly() instead of arriveAndAwaitAdvance() so that
// ExecutorUtil.shutdownNow() can unblock this during container shutdown.
int phase = phaser.arrive();
try {
phaser.awaitAdvanceInterruptibly(phase);
} catch (InterruptedException e) {
// Terminate phaser on interrupt so in-flight onComplete callbacks don't stall.
phaser.forceTermination();
Thread.currentThread().interrupt();
}
phaser.arriveAndDeregister();
}
}

/** Returns the configured maximum number of outstanding async requests. */
public int asyncTrackerMaxPermits() {
return asyncTracker.maxPermits();
}

/** Returns the number of currently available async-request permits. */
public int asyncTrackerAvailablePermits() {
return asyncTracker.availablePermits();
}

public static class Builder
extends HttpSolrClientBuilderBase<Http2SolrClient.Builder, Http2SolrClient> {

Expand Down
Loading