diff --git a/changelog/unreleased/SOLR-18174-prevent-double-registration.yml b/changelog/unreleased/SOLR-18174-prevent-double-registration.yml new file mode 100644 index 000000000000..d31f038ea1b2 --- /dev/null +++ b/changelog/unreleased/SOLR-18174-prevent-double-registration.yml @@ -0,0 +1,8 @@ +title: Fix semaphore permit leaks in Http2SolrClient's AsyncTracker. Avoid IO-thread deadlock on connection failure retries. Add a new metric gauge solr_client_request_async_permits +type: fixed +authors: + - name: Jan Høydahl + url: https://home.apache.org/phonebook.html?uid=janhoy +links: + - name: SOLR-18174 + url: https://issues.apache.org/jira/browse/SOLR-18174 diff --git a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandlerFactory.java b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandlerFactory.java index 9e112b54c52f..9c78d51d4cc6 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandlerFactory.java +++ b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandlerFactory.java @@ -445,5 +445,19 @@ public void initializeMetrics(SolrMetricsContext parentContext, String scope) { null, solrMetricsContext.getMetricRegistry(), SolrMetricManager.mkName("httpShardExecutor", expandedScope, "threadPool")); + if (defaultClient != null) { + solrMetricsContext.gauge( + defaultClient::asyncTrackerAvailablePermits, + true, + "asyncPermits.available", + expandedScope, + "threadPool"); + solrMetricsContext.gauge( + defaultClient::asyncTrackerMaxPermits, + true, + "asyncPermits.max", + expandedScope, + "threadPool"); + } } } diff --git a/solr/core/src/test/org/apache/solr/handler/component/AsyncTrackerSemaphoreLeakTest.java b/solr/core/src/test/org/apache/solr/handler/component/AsyncTrackerSemaphoreLeakTest.java new file mode 100644 index 000000000000..11e34dd3257f --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/component/AsyncTrackerSemaphoreLeakTest.java @@ -0,0 +1,407 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.component; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.net.ServerSocket; +import java.net.Socket; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Phaser; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.lucene.util.SuppressForbidden; +import org.apache.solr.client.solrj.impl.Http2SolrClient; +import org.apache.solr.client.solrj.impl.LBHttp2SolrClient; +import org.apache.solr.client.solrj.impl.LBSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.params.SolrParams; +import org.eclipse.jetty.client.api.Request; +import org.eclipse.jetty.client.api.Response; +import org.eclipse.jetty.client.api.Result; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.mockito.ArgumentMatchers; +import org.mockito.Mockito; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Tests for two semaphore-permit leak bugs in {@link Http2SolrClient}'s {@code AsyncTracker} that + * cause distributed queries to hang permanently. + * + *
Jetty HTTP/2 can re-queue the same exchange after a GOAWAY/connection race, firing {@code + * onRequestQueued} twice for one logical request. Because {@code onComplete} fires only once, one + * permit is permanently consumed per occurrence, gradually draining the semaphore over hours or + * days until Pattern B triggers. + * + *
When a connection-level failure causes {@link + * org.apache.solr.client.solrj.impl.LBHttp2SolrClient} to retry synchronously inside a {@code + * whenComplete} callback on the Jetty IO selector thread, the retry calls {@code acquire()} on that + * same IO thread before the original request's {@code onComplete} can call {@code release()}. No + * permits are permanently lost — the deadlock simply requires two permits to be available + * simultaneously — but if the semaphore is at zero, {@code acquire()} blocks the IO thread + * permanently and distributed queries hang forever. + */ +public class AsyncTrackerSemaphoreLeakTest extends SolrCloudTestCase { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static final String COLLECTION = "semaphore_leak_test"; + + /** Reduced semaphore size so we can observe the drain without needing thousands of requests. */ + private static final int MAX_PERMITS = 40; + + /** + * Number of concurrent requests. Set equal to MAX_PERMITS so that all permits are exhausted + * before any retry can acquire, triggering the IO-thread deadlock. + */ + private static final int NUM_RETRY_REQUESTS = MAX_PERMITS; + + @BeforeClass + public static void setupCluster() throws Exception { + // Reduce the semaphore size so we can observe drain with few requests. + // This property is read when Http2SolrClient is constructed, so it must + // be set BEFORE the cluster (and its HttpShardHandlerFactory) starts up. + System.setProperty(Http2SolrClient.ASYNC_REQUESTS_MAX_SYSPROP, String.valueOf(MAX_PERMITS)); + + configureCluster(1).addConfig("conf", configset("cloud-dynamic")).configure(); + + CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 1) + .process(cluster.getSolrClient()); + + waitForState("Expected 2 active shards with 1 replica each", COLLECTION, clusterShape(2, 2)); + } + + @AfterClass + public static void cleanup() { + System.clearProperty(Http2SolrClient.ASYNC_REQUESTS_MAX_SYSPROP); + } + + /** + * Demonstrates the permanent IO-thread deadlock (Pattern B) caused by {@link + * org.apache.solr.client.solrj.impl.LBHttp2SolrClient} retrying a request synchronously inside a + * {@link CompletableFuture#whenComplete} callback that runs on the Jetty IO selector thread. + * + *
This test passes with the {@code failureDispatchExecutor} fix in this branch. Without
+ * the fix, the IO thread would block forever in {@code semaphore.acquire()} and this test would
+ * time out.
+ */
+ @Test
+ @SuppressForbidden(
+ reason =
+ "Reflection needed to access Http2SolrClient's package-private getHttpClient() to force-stop it during timeout recovery")
+ public void testSemaphoreLeakOnLBRetry() throws Exception {
+ // Dedicated client so that permanently deadlocked IO threads don't affect the cluster's client.
+ Http2SolrClient testClient =
+ new Http2SolrClient.Builder()
+ .withConnectionTimeout(5, TimeUnit.SECONDS)
+ .withIdleTimeout(30, TimeUnit.SECONDS)
+ .useHttp1_1(true) // HTTP/1.1: every request gets its own TCP connection
+ .build();
+
+ String realBaseUrl =
+ cluster.getJettySolrRunners().get(0).getBaseUrl().toString() + "/" + COLLECTION;
+
+ List Rather than setting up a real HTTP/2 server, this test uses reflection to invoke {@code
+ * AsyncTracker.queuedListener} twice and {@code AsyncTracker.completeListener} once for the same
+ * {@code Request} object. Without the guard the semaphore count drops by one; with the guard the
+ * second queued call is a no-op and the count is unchanged.
+ */
+ @Test
+ @SuppressForbidden(
+ reason =
+ "Reflection needed to access AsyncTracker's private fields for white-box testing without exposing them in the production API")
+ public void testPermitLeakOnHttp2GoAwayDoubleQueuedListener() throws Exception {
+ assumeWorkingMockito();
+
+ Http2SolrClient testClient =
+ new Http2SolrClient.Builder()
+ .withConnectionTimeout(5, TimeUnit.SECONDS)
+ .withIdleTimeout(30, TimeUnit.SECONDS)
+ // HTTP/2 is the default transport where this GOAWAY race occurs.
+ .build();
+
+ // Capture asyncTracker and its class for reflection-based listener access and cleanup.
+ Field asyncTrackerField = Http2SolrClient.class.getDeclaredField("asyncTracker");
+ asyncTrackerField.setAccessible(true);
+ Object asyncTracker = asyncTrackerField.get(testClient);
+ Class> asyncTrackerClass = asyncTracker.getClass();
+
+ try {
+ int maxPermits = testClient.asyncTrackerMaxPermits();
+ assertEquals(
+ "All permits available before test",
+ maxPermits,
+ testClient.asyncTrackerAvailablePermits());
+
+ // Access the raw listeners via reflection to simulate Jetty's internal double-fire.
+ Field queuedListenerField = asyncTrackerClass.getDeclaredField("queuedListener");
+ queuedListenerField.setAccessible(true);
+ Request.QueuedListener queuedListener =
+ (Request.QueuedListener) queuedListenerField.get(asyncTracker);
+
+ Field completeListenerField = asyncTrackerClass.getDeclaredField("completeListener");
+ completeListenerField.setAccessible(true);
+ Response.CompleteListener completeListener =
+ (Response.CompleteListener) completeListenerField.get(asyncTracker);
+
+ // Fake Request that supports the attribute get/set used by the idempotency guard.
+ Map Implements {@link AutoCloseable} so that the server socket and any open connections are
+ * always cleaned up when used in a try-with-resources block, even if the test fails or throws.
+ */
+ private static class FakeTcpServer implements AutoCloseable {
+ private final ServerSocket serverSocket;
+ private final List