Skip to content

Commit

Permalink
ZOOKEEPER-3019: add metric for slow fsyncs count
Browse files Browse the repository at this point in the history
Backported fsync metric count from master.

Author: Norbert Kalmar <nkalmar@yahoo.com>

Reviewers: Andor Molnar <andor@apache.org>

Closes #510 from nkalmar/branch-3.4 and squashes the following commits:

0ec9213 [Norbert Kalmar] ZOOKEEPER-3019 remove redundant log
63d3cb9 [Norbert Kalmar] Merge branch 'branch-3.4' of github.com:apache/zookeeper into branch-3.4
deb2543 [Norbert Kalmar] Merge branch 'branch-3.4' of github.com:apache/zookeeper into branch-3.4
336a83b [Norbert Kalmar] ZOOKEEPER-3019 add metric for slow fsyncs count
  • Loading branch information
nkalmar authored and anmolnar committed Jun 8, 2018
1 parent e2ba86e commit 1162f6d
Show file tree
Hide file tree
Showing 22 changed files with 284 additions and 2 deletions.
3 changes: 3 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,11 @@ xmlns:cs="antlib:com.puppycrawl.tools.checkstyle">

<property name="dependency-check-ant.version" value="3.2.1"/>

<property name="hamcrest.version" value="1.3"/>

<property name="clover.version" value="4.2.1" />


<!-- ====================================================== -->
<!-- Project properties -->
<!-- ====================================================== -->
Expand Down
2 changes: 2 additions & 0 deletions ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@
<exclude org="org.slf4j" module="slf4j-api"/>
</dependency>

<dependency org="org.hamcrest" name="hamcrest-all" rev="${hamcrest.version}" conf="test->default" />

<dependency org="org.openclover" name="clover" rev="${clover.version}" conf="clover->default"/>

<conflict manager="strict"/>
Expand Down
1 change: 1 addition & 0 deletions src/contrib/monitoring/README
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ zk_ephemerals_count 0
zk_approximate_data_size 41
zk_open_file_descriptor_count 20
zk_max_file_descriptor_count 1024
zk_fsync_threshold_exceed_count 0

Python 2.6 (maybe it works on previous version but it's not tested yet).

Expand Down
25 changes: 25 additions & 0 deletions src/contrib/monitoring/check_zookeeper.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,31 @@ def _parse_stat(self, data):
result['zk_znode_count'] = int(m.group(1))
continue

m = re.match('Watch count: (\d+)', line)
if m is not None:
result['zk_watch_count'] = int(m.group(1))
continue

m = re.match('Ephemerals count: (\d+)', line)
if m is not None:
result['zk_ephemerals_count'] = int(m.group(1))
continue

m = re.match('Approximate data size: (\d+)', line)
if m is not None:
result['zk_approximate_data_size'] = int(m.group(1))
continue

m = re.match('Open file descriptor count: (\d+)', line)
if m is not None:
result['zk_open_file_descriptor_count'] = int(m.group(1))
continue

m = re.match('Max file descriptor count: (\d+)', line)
if m is not None:
result['zk_max_file_descriptor_count'] = int(m.group(1))
continue

m = re.match('Zxid: (0x[0-9a-fA-F]+)', line)
if m is not None:
result['zk_zxid'] = m.group(1)
Expand Down
1 change: 1 addition & 0 deletions src/contrib/monitoring/ganglia/zookeeper.pyconf
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,6 @@ collection_group {
metric { name = "zk_followers" }
metric { name = "zk_synced_followers" }
metric { name = "zk_pending_syncs" }
metric { name = "zk_fsync_threshold_exceed_count" }
}

3 changes: 2 additions & 1 deletion src/contrib/monitoring/ganglia/zookeeper_ganglia.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ def metric_init(params=None):
'zk_max_file_descriptor_count': {'units': 'descriptors'},
'zk_followers': {'units': 'nodes'},
'zk_synced_followers': {'units': 'nodes'},
'zk_pending_syncs': {'units': 'syncs'}
'zk_pending_syncs': {'units': 'syncs'},
'zk_fsync_threshold_exceed_count': {'units': 'fsyncexceed'}
}
metric_handler.descriptors = {}
for name, updates in metrics.iteritems():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1578,6 +1578,7 @@ zk_synced_followers 4 - only exposed by the Leader
zk_pending_syncs 0 - only exposed by the Leader
zk_open_file_descriptor_count 23 - only available on Unix platforms
zk_max_file_descriptor_count 1024 - only available on Unix platforms
zk_fsync_threshold_exceed_count 0
</programlisting>

<para>The output is compatible with java properties format and the content
Expand Down
2 changes: 2 additions & 0 deletions src/java/main/org/apache/zookeeper/server/NIOServerCnxn.java
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,8 @@ public void commandRun() {
print("max_file_descriptor_count", osMbean.getMaxFileDescriptorCount());
}

print("fsync_threshold_exceed_count", stats.getFsyncThresholdExceedCount());

if(stats.getServerState().equals("leader")) {
Leader leader = ((LeaderZooKeeperServer)zkServer).getLeader();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,9 @@ public void commandRun() {
print("open_file_descriptor_count", osMbean.getOpenFileDescriptorCount());
print("max_file_descriptor_count", osMbean.getMaxFileDescriptorCount());
}


print("fsync_threshold_exceed_count", stats.getFsyncThresholdExceedCount());

if(stats.getServerState().equals("leader")) {
Leader leader = ((LeaderZooKeeperServer)zkServer).getLeader();

Expand Down
16 changes: 16 additions & 0 deletions src/java/main/org/apache/zookeeper/server/ServerStats.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

import org.apache.zookeeper.common.Time;

import java.util.concurrent.atomic.AtomicLong;

/**
* Basic Server Statistics
*/
Expand All @@ -32,6 +34,7 @@ public class ServerStats {
private long minLatency = Long.MAX_VALUE;
private long totalLatency = 0;
private long count = 0;
private AtomicLong fsyncThresholdExceedCount = new AtomicLong(0);

private final Provider provider;

Expand Down Expand Up @@ -103,6 +106,19 @@ public String toString(){
sb.append("Mode: " + getServerState() + "\n");
return sb.toString();
}

public long getFsyncThresholdExceedCount() {
return fsyncThresholdExceedCount.get();
}

public void incrementFsyncThresholdExceedCount() {
fsyncThresholdExceedCount.incrementAndGet();
}

public void resetFsyncThresholdExceedCount() {
fsyncThresholdExceedCount.set(0);
}

// mutators
synchronized void updateLatency(long requestCreateTime) {
long latency = Time.currentElapsedTime() - requestCreateTime;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ public ZooKeeperServer(FileTxnSnapLog txnLogFactory, int tickTime,
DataTreeBuilder treeBuilder, ZKDatabase zkDb) {
serverStats = new ServerStats(this);
this.txnLogFactory = txnLogFactory;
this.txnLogFactory.setServerStats(this.serverStats);
this.zkDb = zkDb;
this.tickTime = tickTime;
this.minSessionTimeout = minSessionTimeout;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ public long getPacketsReceived() {
public long getPacketsSent() {
return zks.serverStats().getPacketsSent();
}

public long getFsyncThresholdExceedCount() {
return zks.serverStats().getFsyncThresholdExceedCount();
}

public void resetLatency() {
zks.serverStats().resetLatency();
Expand All @@ -136,10 +140,15 @@ public void resetMaxLatency() {
zks.serverStats().resetMaxLatency();
}

public void resetFsyncThresholdExceedCount() {
zks.serverStats().resetFsyncThresholdExceedCount();
}

public void resetStatistics() {
ServerStats serverStats = zks.serverStats();
serverStats.resetRequestCounters();
serverStats.resetLatency();
serverStats.resetFsyncThresholdExceedCount();
}

public long getNumAliveConnections() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ public interface ZooKeeperServerMXBean {
* @return number of packets sent so far
*/
public long getPacketsSent();

/**
* @return number of fsync threshold exceeds so far
*/
public long getFsyncThresholdExceedCount();
/**
* @return number of outstanding requests.
*/
Expand Down Expand Up @@ -103,6 +108,11 @@ public interface ZooKeeperServerMXBean {
* Reset max latency statistics only.
*/
public void resetMaxLatency();

/**
* Reset Fsync Threshold Exceed Count statistics only.
*/
public void resetFsyncThresholdExceedCount();
/**
* @return number of alive client connections
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ public void runFromConfig(ServerConfig config) throws IOException {

txnLog = new FileTxnSnapLog(new File(config.dataLogDir), new File(
config.dataDir));
txnLog.setServerStats(zkServer.serverStats());
zkServer.setTxnLogFactory(txnLog);
zkServer.setTickTime(config.tickTime);
zkServer.setMinSessionTimeout(config.minSessionTimeout);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.jute.InputArchive;
import org.apache.jute.OutputArchive;
import org.apache.jute.Record;
import org.apache.zookeeper.server.ServerStats;
import org.apache.zookeeper.server.util.SerializeUtils;
import org.apache.zookeeper.txn.TxnHeader;
import org.slf4j.Logger;
Expand Down Expand Up @@ -124,6 +125,8 @@ public class FileTxnLog implements TxnLog {
File logFileWrite = null;
private FilePadding filePadding = new FilePadding();

private ServerStats serverStats;

/**
* constructor for FileTxnLog. Take the directory
* where the txnlogs are stored
Expand All @@ -142,6 +145,15 @@ public static void setPreallocSize(long size) {
FilePadding.setPreallocSize(size);
}

/**
* Setter for ServerStats to monitor fsync threshold exceed
* @param serverStats used to update fsyncThresholdExceedCount
*/
@Override
public void setServerStats(ServerStats serverStats) {
this.serverStats = serverStats;
}

/**
* creates a checksum algorithm to be used
* @return the checksum used for this txnlog
Expand Down Expand Up @@ -320,6 +332,9 @@ public synchronized void commit() throws IOException {
long syncElapsedMS =
TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startSyncNS);
if (syncElapsedMS > fsyncWarningThresholdMS) {
if(serverStats != null) {
serverStats.incrementFsyncThresholdExceedCount();
}
LOG.warn("fsync-ing the write ahead log in "
+ Thread.currentThread().getName()
+ " took " + syncElapsedMS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.zookeeper.server.DataTree;
import org.apache.zookeeper.server.DataTree.ProcessTxnResult;
import org.apache.zookeeper.server.Request;
import org.apache.zookeeper.server.ServerStats;
import org.apache.zookeeper.server.ZooTrace;
import org.apache.zookeeper.server.persistence.TxnLog.TxnIterator;
import org.apache.zookeeper.txn.CreateSessionTxn;
Expand Down Expand Up @@ -112,6 +113,10 @@ public FileTxnSnapLog(File dataDir, File snapDir) throws IOException {
snapLog = new FileSnap(this.snapDir);
}

public void setServerStats(ServerStats serverStats) {
txnLog.setServerStats(serverStats);
}

private void checkLogDir() throws LogDirContentCheckException {
File[] files = this.dataDir.listFiles(new FilenameFilter() {
@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,20 @@
import java.io.IOException;

import org.apache.jute.Record;
import org.apache.zookeeper.server.ServerStats;
import org.apache.zookeeper.txn.TxnHeader;

/**
* Interface for reading transaction logs.
*
*/
public interface TxnLog {

/**
+ * Setter for ServerStats to monitor fsync threshold exceed
+ * @param serverStats used to update fsyncThresholdExceedCount
+ */
void setServerStats(ServerStats serverStats);

/**
* roll the current
Expand Down
5 changes: 5 additions & 0 deletions src/java/test/config/findbugsExcludeFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@
<Field name="lastProposed"/>
<Bug code="IS"/>
</Match>
<Match>
<Class name="org.apache.zookeeper.server.persistence.FileTxnLog"/>
<Field name="serverStats"/>
<Bug code="IS"/>
</Match>
<Match>
<Class name="org.apache.zookeeper.server.quorum.LearnerSessionTracker"/>
<Bug code="UrF"/>
Expand Down

0 comments on commit 1162f6d

Please sign in to comment.