Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions gradle/testing/randomization/policies/solr-tests.policy
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,11 @@ grant {
permission javax.management.MBeanServerPermission "findMBeanServer";
permission javax.management.MBeanServerPermission "releaseMBeanServer";
permission javax.management.MBeanTrustPermission "register";

// needed by crossdc
permission javax.security.auth.AuthPermission "getLoginConfiguration";
permission javax.security.auth.AuthPermission "setLoginConfiguration";

// needed by benchmark
permission java.security.SecurityPermission "insertProvider";

Expand Down Expand Up @@ -206,7 +206,7 @@ grant {

// additional permissions based on system properties set by /bin/solr
// NOTE: if the property is not set, the permission entry is ignored.
grant {
grant {
permission java.io.FilePermission "${solr.jetty.keystore}", "read,write,delete,readlink";
permission java.io.FilePermission "${solr.jetty.keystore}${/}-", "read,write,delete,readlink";

Expand Down Expand Up @@ -277,3 +277,7 @@ grant {
// Allow testing effects of customized or bug-fixed dependencies locally (also need to add mavenLocal() to build)
permission java.io.FilePermission "${user.home}${/}.m2${/}repository${/}-", "read";
};

grant {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Accidental? But it's okay

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not an accident. I needed it for JvmMetricsTest.

permission jdk.jfr.FlightRecorderPermission "accessFlightRecorder";
};
Comment on lines +281 to +283
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this because the OTEL library uses JFR to collect JVM metrics.

12 changes: 8 additions & 4 deletions solr/core/src/java/org/apache/solr/core/SolrCore.java
Original file line number Diff line number Diff line change
Expand Up @@ -1315,7 +1315,7 @@ private UpdateHandler initUpdateHandler(UpdateHandler updateHandler) {
newUpdateHandler = createUpdateHandler(updateHandlerClass, updateHandler);
}
if (newUpdateHandler != null) {
coreMetricManager.registerMetricProducer("updateHandler", newUpdateHandler);
coreMetricManager.registerMetricProducer(newUpdateHandler, Attributes.empty());
}
infoRegistry.put("updateHandler", newUpdateHandler);
return newUpdateHandler;
Expand Down Expand Up @@ -3255,7 +3255,8 @@ public <T> T initPlugins(
if (registry != null) registry.put(info.name, o);
if (o instanceof SolrMetricProducer) {
coreMetricManager.registerMetricProducer(
type.getSimpleName() + "." + info.name, (SolrMetricProducer) o);
(SolrMetricProducer) o,
Attributes.of(PLUGIN_NAME_ATTR, type.getSimpleName() + "." + info.name));
}
if (o instanceof CircuitBreaker) {
if (o instanceof SolrCoreAware) {
Expand All @@ -3273,7 +3274,8 @@ public <T> T initPlugins(
public void initDefaultPlugin(Object plugin, Class<?> type) {
if (plugin instanceof SolrMetricProducer) {
coreMetricManager.registerMetricProducer(
type.getSimpleName() + ".default", (SolrMetricProducer) plugin);
(SolrMetricProducer) plugin,
Attributes.of(PLUGIN_NAME_ATTR, type.getSimpleName() + ".default"));
}
}

Expand Down Expand Up @@ -3583,7 +3585,9 @@ public void registerInfoBean(String name, SolrInfoBean solrInfoBean) {
infoRegistry.put(name, solrInfoBean);

if (solrInfoBean != null) {
coreMetricManager.registerMetricProducer(name, solrInfoBean);
Attributes attributes =
(name.startsWith("/")) ? Attributes.of(HANDLER_ATTR, name) : Attributes.empty();
coreMetricManager.registerMetricProducer(solrInfoBean, attributes);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@
*/
package org.apache.solr.metrics;

import static org.apache.solr.metrics.SolrMetricProducer.HANDLER_ATTR;

import com.codahale.metrics.MetricRegistry;
import io.opentelemetry.api.common.AttributeKey;
import io.opentelemetry.api.common.Attributes;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
Expand Down Expand Up @@ -57,7 +56,7 @@ public class SolrCoreMetricManager implements Closeable {
// rename
private final List<MetricProducerInfo> registeredProducers = new ArrayList<>();

private record MetricProducerInfo(SolrMetricProducer producer, String scope) {}
private record MetricProducerInfo(SolrMetricProducer producer, Attributes attributes) {}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did a little bit of cleanup on this because of this weird scope string.


/**
* Constructs a metric manager.
Expand Down Expand Up @@ -132,11 +131,10 @@ public void reregisterCoreMetrics() {

registeredProducers.forEach(
metricProducer -> {
var producerAttributes = core.getCoreAttributes().toBuilder();
if (metricProducer.scope().startsWith("/"))
producerAttributes.put(HANDLER_ATTR, metricProducer.scope);
metricProducer.producer.initializeMetrics(
solrMetricsContext, producerAttributes.build(), metricProducer.scope);
solrMetricsContext,
metricProducer.attributes.toBuilder().putAll(core.getCoreAttributes()).build(),
"");
});
}

Expand All @@ -145,29 +143,28 @@ public void reregisterCoreMetrics() {
* set of attributes for core level metrics. All metric producers are tracked for re-registering
* in the case of core swapping/renaming
*
* @param scope the scope of the metrics to be registered (e.g. `/admin/ping`)
* @param producer producer of metrics to be registered
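* @param attributes attributes to attach to the producer's metrics; they are combined with the core's attributes when the producer's metrics are initialized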
*/
public void registerMetricProducer(String scope, SolrMetricProducer producer) {
if (scope == null || producer == null) {
public void registerMetricProducer(SolrMetricProducer producer, Attributes attributes) {
if (attributes == null || producer == null) {
throw new IllegalArgumentException(
"registerMetricProducer() called with illegal arguments: "
+ "scope = "
+ scope
+ "attributes = "
+ attributes
+ ", producer = "
+ producer);
}

// Track this producer for potential re-initialization during core rename
registeredProducers.add(new MetricProducerInfo(producer, scope));
registeredProducers.add(new MetricProducerInfo(producer, attributes));

// TODO: We initialize metrics with attributes of the core. This happens again in
// reregisterCoreMetrics
// There is some possible improvement that can be done here to not have to duplicate code in
// reregisterCoreMetrics
var attributesBuilder = core.getCoreAttributes().toBuilder();
if (scope.startsWith("/")) attributesBuilder.put(HANDLER_ATTR, scope);
producer.initializeMetrics(solrMetricsContext, attributesBuilder.build(), scope);
producer.initializeMetrics(
solrMetricsContext, attributes.toBuilder().putAll(core.getCoreAttributes()).build(), "");
}

/** Return the registry used by this SolrCore. */
Expand All @@ -190,9 +187,6 @@ public void close() throws IOException {
}
metricManager.unregisterGauges(
solrMetricsContext.getRegistryName(), solrMetricsContext.getTag());

metricManager.removeRegistry(solrMetricsContext.getRegistryName());
registeredProducers.clear();
Comment on lines -194 to -195
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think I found the culprit for reload and metrics not working correctly. I had added code here to delete the registry for the OTEL metrics, but the Dropwizard path never deleted the registry; it only unregistered gauges, which are not all of the metrics. So I removed this, and the tests all pass like normal.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you saying that a core's metrics are mostly still accessible, even after a core is deleted? Technically you didn't, this code is in the context of "close"... but I'd hope that a high level command to remove/delete a core would do similarly for the underlying metrics.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metrics are not accessible after a core delete or unload. Those metrics are completely gone, and the registry goes with them. This change was for the reload scenario. It turns out that, on reload, the normal counters and meters in Dropwizard were not reset and the registry was never actually deleted; only the gauges were removed, it seems.

}

public SolrMetricsContext getSolrMetricsContext() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@
import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.metrics.otel.FilterablePrometheusMetricReader;
import org.apache.solr.metrics.otel.MetricExporterFactory;
import org.apache.solr.metrics.otel.NoopMetricExporter;
import org.apache.solr.metrics.otel.OtelUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -889,12 +888,11 @@ private static MetricRegistry getOrCreateRegistry(
*/
// NOCOMMIT: Remove this
public void removeRegistry(String registry) {
meterProviderAndReaders.computeIfPresent(
enforcePrefix(registry),
(key, meterAndReader) -> {
IOUtils.closeQuietly(meterAndReader.sdkMeterProvider());
return null;
});
String key = enforcePrefix(registry);
MeterProviderAndReaders mpr = meterProviderAndReaders.remove(key);
if (mpr != null) {
IOUtils.closeQuietly(mpr.sdkMeterProvider());
}
Comment on lines +891 to +895
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one was difficult to track down. Tests were running into a deadlock because of it. What I found is that when cores were being closed, their meter providers were being closed with them. The periodic metric reader was being registered even when the exporter was a Noop. When the periodic metric reader is closed, it triggers one last metric collection for the exporter, which calls the observable instruments' callbacks, all while computeIfPresent is holding the lock for that ConcurrentHashMap bucket. One of the callbacks was getting the index size in SolrCore, which needs the searcher lock. At the same time, the SolrCore close was holding that same searcher lock, creating a deadlock. Basically, from what I can tell:

Thread A: holds the ConcurrentHashMap bucket lock → waiting for searcherLock (inside the callback).
Thread B: holds searcherLock from the Solr close call → waiting for the ConcurrentHashMap bucket lock.

Deadlock!

So I changed this to remove the map entry first and then close the provider outside of computeIfPresent.

}

/** Close all meter providers and their associated metric readers. */
Expand Down Expand Up @@ -1769,7 +1767,7 @@ public MetricExporter getMetricExporter() {
}

private MetricExporter loadMetricExporter(SolrResourceLoader loader) {
if (!OTLP_EXPORTER_ENABLED) return new NoopMetricExporter();
if (!OTLP_EXPORTER_ENABLED) return null;
try {
MetricExporterFactory exporterFactory =
loader.newInstance(
Expand All @@ -1778,7 +1776,7 @@ private MetricExporter loadMetricExporter(SolrResourceLoader loader) {
} catch (SolrException e) {
log.error(
"Could not load OTLP exporter. Check that the Open Telemetry module is enabled.", e);
return new NoopMetricExporter();
return null;
}
}

Expand Down

This file was deleted.

3 changes: 1 addition & 2 deletions solr/core/src/java/org/apache/solr/update/PeerSync.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,7 @@ public PeerSync(
shardHandler = shardHandlerFactory.getShardHandler();
this.updater = new Updater(msg(), core);

core.getCoreMetricManager()
.registerMetricProducer(SolrInfoBean.Category.REPLICATION.toString(), this);
core.getCoreMetricManager().registerMetricProducer(this, Attributes.empty());
}

public static final String METRIC_SCOPE = "peerSync";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ public PeerSyncWithLeader(SolrCore core, String leaderUrl, int nUpdates) {

this.updater = new PeerSync.Updater(msg(), core);

core.getCoreMetricManager()
.registerMetricProducer(SolrInfoBean.Category.REPLICATION.toString(), this);
core.getCoreMetricManager().registerMetricProducer(this, Attributes.empty());
}

public static final String METRIC_SCOPE = "peerSync";
Expand Down
5 changes: 2 additions & 3 deletions solr/core/src/java/org/apache/solr/update/UpdateLog.java
Original file line number Diff line number Diff line change
Expand Up @@ -453,8 +453,7 @@ public void init(UpdateHandler uhandler, SolrCore core) {
getTlogDir(),
id);
}
core.getCoreMetricManager()
.registerMetricProducer(SolrInfoBean.Category.TLOG.toString(), this);
core.getCoreMetricManager().registerMetricProducer(this, Attributes.empty());

String reResolved = resolveDataDir(core);
if (dataDir == null || !dataDir.equals(reResolved)) {
Expand Down Expand Up @@ -500,7 +499,7 @@ public void init(UpdateHandler uhandler, SolrCore core) {
trackDeleteByQuery(q, version);
}
}
core.getCoreMetricManager().registerMetricProducer(SolrInfoBean.Category.TLOG.toString(), this);
core.getCoreMetricManager().registerMetricProducer(this, Attributes.empty());
}

protected final void maybeClearLog(SolrCore core) {
Expand Down
16 changes: 8 additions & 8 deletions solr/core/src/test/org/apache/solr/core/RequestHandlersTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
*/
package org.apache.solr.core;

import com.codahale.metrics.Gauge;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.metrics.SolrMetricManager;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.util.SolrMetricTestUtils;
import org.junit.BeforeClass;
Expand All @@ -32,12 +30,14 @@ public static void beforeClass() throws Exception {

@Test
public void testInitCount() {
String registry = h.getCore().getCoreMetricManager().getRegistryName();
SolrMetricManager manager = h.getCoreContainer().getMetricManager();
@SuppressWarnings({"unchecked"})
Gauge<Number> g =
(Gauge<Number>) manager.registry(registry).getMetrics().get("QUERY./mock.initCount");
assertEquals("Incorrect init count", 1, g.getValue().intValue());
var datapoint =
SolrMetricTestUtils.getGaugeDatapoint(
h.getCore(),
"mock_request",
SolrMetricTestUtils.newStandaloneLabelsBuilder(h.getCore())
.label("handler", "/mock")
.build());
assertEquals(1.0, datapoint.getValue(), 0.0);
}

@Test
Expand Down
Loading