From 00045251755c04696b430a5bb723654b4529efc0 Mon Sep 17 00:00:00 2001 From: Dong Lin Date: Sun, 2 Apr 2017 17:46:34 -0700 Subject: [PATCH 01/11] KAFKA-4763; Handle disk failure for JBOD (KIP-112) --- .../apache/kafka/common/PartitionInfo.java | 21 +- .../common/errors/KafkaStorageException.java | 33 ++ .../errors/UnknownRetriableException.java | 40 ++ .../apache/kafka/common/protocol/Errors.java | 22 +- .../kafka/common/protocol/Protocol.java | 123 +++++- .../kafka/common/record/FileRecords.java | 7 + .../kafka/common/requests/FetchResponse.java | 5 +- .../common/requests/LeaderAndIsrRequest.java | 12 +- .../common/requests/MetadataResponse.java | 54 ++- .../kafka/common/requests/PartitionState.java | 8 +- .../kafka/common/requests/ProduceRequest.java | 4 + .../common/requests/ProduceResponse.java | 5 +- .../requests/UpdateMetadataRequest.java | 58 ++- .../consumer/internals/FetcherTest.java | 3 +- .../kafka/common/PartitionInfoTest.java | 15 +- .../common/requests/RequestResponseTest.java | 22 +- .../main/scala/kafka/admin/AdminUtils.scala | 2 +- .../src/main/scala/kafka/api/ApiVersion.scala | 12 +- .../main/scala/kafka/api/LeaderAndIsr.scala | 17 +- .../main/scala/kafka/cluster/Partition.scala | 26 +- .../kafka/common/KafkaStorageException.scala | 2 +- .../consumer/ConsumerFetcherThread.scala | 6 +- .../controller/ControllerChannelManager.scala | 52 ++- .../kafka/controller/ControllerState.scala | 11 +- .../kafka/controller/KafkaController.scala | 174 ++++++-- .../controller/PartitionLeaderSelector.scala | 9 +- .../controller/PartitionStateMachine.scala | 8 +- .../controller/ReplicaStateMachine.scala | 4 +- .../controller/TopicDeletionManager.scala | 2 +- .../group/GroupMetadataManager.scala | 5 +- .../transaction/TransactionCoordinator.scala | 2 +- .../main/scala/kafka/log/AbstractIndex.scala | 6 + core/src/main/scala/kafka/log/Log.scala | 10 + .../src/main/scala/kafka/log/LogCleaner.scala | 113 +++--- .../scala/kafka/log/LogCleanerManager.scala | 67 ++- 
.../src/main/scala/kafka/log/LogManager.scala | 383 ++++++++++++------ .../src/main/scala/kafka/log/LogSegment.scala | 10 + .../kafka/server/AbstractFetcherThread.scala | 26 +- .../server/BrokerMetadataCheckpoint.scala | 3 +- .../main/scala/kafka/server/KafkaApis.scala | 59 ++- .../main/scala/kafka/server/KafkaServer.scala | 42 +- .../kafka/server/LogDirFailureChannel.scala | 50 +++ .../scala/kafka/server/MetadataCache.scala | 20 +- .../kafka/server/ReplicaFetcherThread.scala | 23 +- .../scala/kafka/server/ReplicaManager.scala | 241 ++++++++--- .../server/checkpoints/CheckpointFile.scala | 2 +- .../main/scala/kafka/utils/LogDirUtils.scala | 63 +++ core/src/main/scala/kafka/utils/ZkUtils.scala | 7 +- .../kafka/api/AuthorizerIntegrationTest.scala | 8 +- .../kafka/api/IntegrationTestHarness.scala | 5 +- .../kafka/api/LogDirFailureTest.scala | 116 ++++++ .../kafka/api/TransactionsTest.scala | 4 +- .../ReplicaFetcherThreadFatalErrorTest.scala | 2 +- .../AbstractLogCleanerIntegrationTest.scala | 5 +- .../kafka/log/LogCleanerIntegrationTest.scala | 2 +- .../kafka/log/LogCleanerManagerTest.scala | 4 +- .../scala/unit/kafka/log/LogManagerTest.scala | 32 +- .../server/AbstractFetcherThreadTest.scala | 6 +- .../server/HighwatermarkPersistenceTest.scala | 27 +- .../unit/kafka/server/ISRExpirationTest.scala | 14 +- .../kafka/server/LeaderElectionTest.scala | 5 +- .../unit/kafka/server/LogOffsetTest.scala | 4 +- .../unit/kafka/server/MetadataCacheTest.scala | 18 +- .../server/ReplicaManagerQuotasTest.scala | 4 +- .../kafka/server/ReplicaManagerTest.scala | 71 ++-- .../unit/kafka/server/RequestQuotaTest.scala | 7 +- .../unit/kafka/server/SimpleFetchTest.scala | 5 +- .../epoch/OffsetsForLeaderEpochTest.scala | 26 +- .../scala/unit/kafka/utils/TestUtils.scala | 22 +- docs/upgrade.html | 51 ++- .../utils/IntegrationTestUtils.java | 8 +- .../internals/InternalTopicManagerTest.java | 2 +- tests/kafkatest/services/kafka/config.py | 2 +- .../services/kafka/config_property.py | 4 + 
tests/kafkatest/services/kafka/kafka.py | 61 ++- .../tests/core/log_dir_failure_test.py | 177 ++++++++ 76 files changed, 1955 insertions(+), 626 deletions(-) create mode 100644 clients/src/main/java/org/apache/kafka/common/errors/KafkaStorageException.java create mode 100644 clients/src/main/java/org/apache/kafka/common/errors/UnknownRetriableException.java create mode 100644 core/src/main/scala/kafka/server/LogDirFailureChannel.scala create mode 100644 core/src/main/scala/kafka/utils/LogDirUtils.scala create mode 100644 core/src/test/scala/integration/kafka/api/LogDirFailureTest.scala create mode 100644 tests/kafkatest/tests/core/log_dir_failure_test.py diff --git a/clients/src/main/java/org/apache/kafka/common/PartitionInfo.java b/clients/src/main/java/org/apache/kafka/common/PartitionInfo.java index b35111696373c..0d979d7fcccba 100644 --- a/clients/src/main/java/org/apache/kafka/common/PartitionInfo.java +++ b/clients/src/main/java/org/apache/kafka/common/PartitionInfo.java @@ -17,7 +17,7 @@ package org.apache.kafka.common; /** - * Information about a topic-partition. + * Information about a topic-partition. This is used to describe MetadataPartitionInfo. 
*/ public class PartitionInfo { @@ -26,13 +26,20 @@ public class PartitionInfo { private final Node leader; private final Node[] replicas; private final Node[] inSyncReplicas; + private final Node[] offlineReplicas; + // Used only by tests public PartitionInfo(String topic, int partition, Node leader, Node[] replicas, Node[] inSyncReplicas) { + this(topic, partition, leader, replicas, inSyncReplicas, new Node[0]); + } + + public PartitionInfo(String topic, int partition, Node leader, Node[] replicas, Node[] inSyncReplicas, Node[] offlineReplicas) { this.topic = topic; this.partition = partition; this.leader = leader; this.replicas = replicas; this.inSyncReplicas = inSyncReplicas; + this.offlineReplicas = offlineReplicas; } /** @@ -71,14 +78,22 @@ public Node[] inSyncReplicas() { return inSyncReplicas; } + /** + * The subset of the replicas that are offline + */ + public Node[] offlineReplicas() { + return offlineReplicas; + } + @Override public String toString() { - return String.format("Partition(topic = %s, partition = %d, leader = %s, replicas = %s, isr = %s)", + return String.format("Partition(topic = %s, partition = %d, leader = %s, replicas = %s, isr = %s, offlineReplicas = %s)", topic, partition, leader == null ? "none" : leader.idString(), formatNodeIds(replicas), - formatNodeIds(inSyncReplicas)); + formatNodeIds(inSyncReplicas), + formatNodeIds(offlineReplicas)); } /* Extract the node ids from each item in the array and format for display */ diff --git a/clients/src/main/java/org/apache/kafka/common/errors/KafkaStorageException.java b/clients/src/main/java/org/apache/kafka/common/errors/KafkaStorageException.java new file mode 100644 index 0000000000000..fdf24ce32fc40 --- /dev/null +++ b/clients/src/main/java/org/apache/kafka/common/errors/KafkaStorageException.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.common.errors; + +/** + * Miscellaneous disk-related IOException occurred when handling a request. + */ +public class KafkaStorageException extends RetriableException { + + private static final long serialVersionUID = 1L; + + public KafkaStorageException() { + super(); + } + + public KafkaStorageException(String message) { + super(message); + } +} diff --git a/clients/src/main/java/org/apache/kafka/common/errors/UnknownRetriableException.java b/clients/src/main/java/org/apache/kafka/common/errors/UnknownRetriableException.java new file mode 100644 index 0000000000000..37a7e491c1c92 --- /dev/null +++ b/clients/src/main/java/org/apache/kafka/common/errors/UnknownRetriableException.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.common.errors; + +/** + * An error code on the server for which the client doesn't have a corresponding error. + */ +public class UnknownRetriableException extends RetriableException { + + private static final long serialVersionUID = 1L; + + public UnknownRetriableException() { + } + + public UnknownRetriableException(String message) { + super(message); + } + + public UnknownRetriableException(Throwable cause) { + super(cause); + } + + public UnknownRetriableException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/clients/src/main/java/org/apache/kafka/common/protocol/Errors.java b/clients/src/main/java/org/apache/kafka/common/protocol/Errors.java index ae8d16117afd1..70acbcbe7b8a4 100644 --- a/clients/src/main/java/org/apache/kafka/common/protocol/Errors.java +++ b/clients/src/main/java/org/apache/kafka/common/protocol/Errors.java @@ -44,6 +44,7 @@ import org.apache.kafka.common.errors.InvalidTopicException; import org.apache.kafka.common.errors.InvalidTxnStateException; import org.apache.kafka.common.errors.InvalidTxnTimeoutException; +import org.apache.kafka.common.errors.KafkaStorageException; import org.apache.kafka.common.errors.LeaderNotAvailableException; import org.apache.kafka.common.errors.NetworkException; import org.apache.kafka.common.errors.NotControllerException; @@ -69,6 +70,7 @@ import org.apache.kafka.common.errors.TransactionalIdAuthorizationException; import org.apache.kafka.common.errors.TransactionCoordinatorFencedException; import 
org.apache.kafka.common.errors.UnknownMemberIdException; +import org.apache.kafka.common.errors.UnknownRetriableException; import org.apache.kafka.common.errors.UnknownServerException; import org.apache.kafka.common.errors.UnknownTopicOrPartitionException; import org.apache.kafka.common.errors.UnsupportedForMessageFormatException; @@ -87,6 +89,13 @@ * Do not add exceptions that occur only on the client or only on the server here. */ public enum Errors { + UNKNOWN_RETRIABLE(-2, "The client received an unexpected error code when processing the response", + new ApiExceptionBuilder() { + @Override + public ApiException build(String message) { + return new UnknownRetriableException(message); + } + }), UNKNOWN(-1, "The server experienced an unexpected error when processing the request", new ApiExceptionBuilder() { @Override @@ -495,7 +504,14 @@ public ApiException build(String message) { public ApiException build(String message) { return new OperationNotAttemptedException(message); } - }); + }), + KAFKA_STORAGE_ERROR(56, "Disk error when trying to access log file on the disk.", + new ApiExceptionBuilder() { + @Override + public ApiException build(String message) { + return new KafkaStorageException(message); + } + }); private interface ApiExceptionBuilder { ApiException build(String message); @@ -587,8 +603,8 @@ public static Errors forCode(short code) { if (error != null) { return error; } else { - log.warn("Unexpected error code: {}.", code); - return UNKNOWN; + log.warn("Unknown error code: {}.", code); + return UNKNOWN_RETRIABLE; } } diff --git a/clients/src/main/java/org/apache/kafka/common/protocol/Protocol.java b/clients/src/main/java/org/apache/kafka/common/protocol/Protocol.java index 383332b93f00f..404d12903eae5 100644 --- a/clients/src/main/java/org/apache/kafka/common/protocol/Protocol.java +++ b/clients/src/main/java/org/apache/kafka/common/protocol/Protocol.java @@ -78,6 +78,9 @@ public class Protocol { "topics that don't exist will be created by the 
broker. " + "Otherwise, no topics will be created by the broker.")); + /* The v5 metadata request is the same as v4. An additional field for offline_replicas has been added to the v5 metadata response */ + public static final Schema METADATA_REQUEST_V5 = METADATA_REQUEST_V4; + public static final Schema METADATA_BROKER_V0 = new Schema(new Field("node_id", INT32, "The broker id."), new Field("host", STRING, "The hostname of the broker."), new Field("port", INT32, @@ -121,12 +124,38 @@ public class Protocol { public static final Schema PARTITION_METADATA_V1 = PARTITION_METADATA_V0; + public static final Schema PARTITION_METADATA_V2 = new Schema(new Field("partition_error_code", + INT16, + "The error code for the partition, if any."), + new Field("partition_id", + INT32, + "The id of the partition."), + new Field("leader", + INT32, + "The id of the broker acting as leader for this partition."), + new Field("replicas", + new ArrayOf(INT32), + "The set of all nodes that host this partition."), + new Field("isr", + new ArrayOf(INT32), + "The set of nodes that are in sync with the leader for this partition."), + new Field("offline_replicas", + new ArrayOf(INT32), + "The set of offline replicas of this partition.")); + public static final Schema TOPIC_METADATA_V1 = new Schema(new Field("topic_error_code", INT16, "The error code for the given topic."), new Field("topic", STRING, "The name of the topic"), new Field("is_internal", BOOLEAN, "Indicates if the topic is considered a Kafka internal topic"), new Field("partition_metadata", new ArrayOf(PARTITION_METADATA_V1), - "Metadata for each partition of the topic.")); + "Metadata for each partition of the topic.")); + + public static final Schema TOPIC_METADATA_V2 = new Schema(new Field("topic_error_code", INT16, "The error code for the given topic."), + new Field("topic", STRING, "The name of the topic"), + new Field("is_internal", BOOLEAN, + "Indicates if the topic is considered a Kafka internal topic"), + new 
Field("partition_metadata", new ArrayOf(PARTITION_METADATA_V2), + "Metadata for each partition of the topic.")); public static final Schema METADATA_RESPONSE_V1 = new Schema(new Field("brokers", new ArrayOf(METADATA_BROKER_V1), "Host and port information for all brokers."), @@ -154,8 +183,19 @@ public class Protocol { public static final Schema METADATA_RESPONSE_V4 = METADATA_RESPONSE_V3; - public static final Schema[] METADATA_REQUEST = {METADATA_REQUEST_V0, METADATA_REQUEST_V1, METADATA_REQUEST_V2, METADATA_REQUEST_V3, METADATA_REQUEST_V4}; - public static final Schema[] METADATA_RESPONSE = {METADATA_RESPONSE_V0, METADATA_RESPONSE_V1, METADATA_RESPONSE_V2, METADATA_RESPONSE_V3, METADATA_RESPONSE_V4}; + // METADATA_RESPONSE_V5 added a per-partition offline_replicas field. This field specifies the list of replicas that are offline. + public static final Schema METADATA_RESPONSE_V5 = new Schema( + newThrottleTimeField(), + new Field("brokers", new ArrayOf(METADATA_BROKER_V1), + "Host and port information for all brokers."), + new Field("cluster_id", NULLABLE_STRING, + "The cluster id that this broker belongs to."), + new Field("controller_id", INT32, + "The broker id of the controller broker."), + new Field("topic_metadata", new ArrayOf(TOPIC_METADATA_V2))); + + public static final Schema[] METADATA_REQUEST = {METADATA_REQUEST_V0, METADATA_REQUEST_V1, METADATA_REQUEST_V2, METADATA_REQUEST_V3, METADATA_REQUEST_V4, METADATA_REQUEST_V5}; + public static final Schema[] METADATA_RESPONSE = {METADATA_RESPONSE_V0, METADATA_RESPONSE_V1, METADATA_RESPONSE_V2, METADATA_RESPONSE_V3, METADATA_RESPONSE_V4, METADATA_RESPONSE_V5}; /* Produce api */ @@ -205,6 +245,8 @@ public class Protocol { new Field("timeout", INT32, "The time to await a response in ms."), new Field("topic_data", new ArrayOf(TOPIC_PRODUCE_DATA_V0))); + public static final Schema PRODUCE_REQUEST_V4 = PRODUCE_REQUEST_V3; + public static final Schema PRODUCE_RESPONSE_V1 = new Schema(new Field("responses", new 
ArrayOf(new Schema(new Field("topic", STRING), new Field("partition_responses", @@ -236,10 +278,18 @@ public class Protocol { "If LogAppendTime is used for the topic, the timestamp will be " + "the broker local time when the messages are appended."))))))), newThrottleTimeField()); + public static final Schema PRODUCE_RESPONSE_V3 = PRODUCE_RESPONSE_V2; - public static final Schema[] PRODUCE_REQUEST = {PRODUCE_REQUEST_V0, PRODUCE_REQUEST_V1, PRODUCE_REQUEST_V2, PRODUCE_REQUEST_V3}; - public static final Schema[] PRODUCE_RESPONSE = {PRODUCE_RESPONSE_V0, PRODUCE_RESPONSE_V1, PRODUCE_RESPONSE_V2, PRODUCE_RESPONSE_V3}; + /** + * The body of PRODUCE_RESPONSE_V4 is the same as PRODUCE_RESPONSE_V3. + * The version number is bumped up to indicate that the client supports UnknownRetriableException. + * The KafkaStorageException will be translated to NotLeaderForPartitionException if version <= 3 + */ + public static final Schema PRODUCE_RESPONSE_V4 = PRODUCE_RESPONSE_V3; + + public static final Schema[] PRODUCE_REQUEST = {PRODUCE_REQUEST_V0, PRODUCE_REQUEST_V1, PRODUCE_REQUEST_V2, PRODUCE_REQUEST_V3, PRODUCE_REQUEST_V4}; + public static final Schema[] PRODUCE_RESPONSE = {PRODUCE_RESPONSE_V0, PRODUCE_RESPONSE_V1, PRODUCE_RESPONSE_V2, PRODUCE_RESPONSE_V3, PRODUCE_RESPONSE_V4}; /* Offset commit api */ public static final Schema OFFSET_COMMIT_REQUEST_PARTITION_V0 = new Schema(new Field("partition", @@ -666,6 +716,8 @@ public class Protocol { new ArrayOf(FETCH_REQUEST_TOPIC_V5), "Topics to fetch in the order provided.")); + public static final Schema FETCH_REQUEST_V6 = FETCH_REQUEST_V5; + public static final Schema FETCH_RESPONSE_PARTITION_HEADER_V0 = new Schema(new Field("partition", INT32, "Topic partition id."), @@ -692,7 +744,6 @@ public class Protocol { public static final Schema FETCH_RESPONSE_V2 = FETCH_RESPONSE_V1; public static final Schema FETCH_RESPONSE_V3 = FETCH_RESPONSE_V2; - // The v4 Fetch Response adds features for transactional consumption (the aborted 
transaction list and the // last stable offset). It also exposes messages with magic v2 (along with older formats). private static final Schema FETCH_RESPONSE_ABORTED_TRANSACTION_V4 = new Schema( @@ -759,8 +810,15 @@ public class Protocol { newThrottleTimeField(), new Field("responses", new ArrayOf(FETCH_RESPONSE_TOPIC_V5))); - public static final Schema[] FETCH_REQUEST = {FETCH_REQUEST_V0, FETCH_REQUEST_V1, FETCH_REQUEST_V2, FETCH_REQUEST_V3, FETCH_REQUEST_V4, FETCH_REQUEST_V5}; - public static final Schema[] FETCH_RESPONSE = {FETCH_RESPONSE_V0, FETCH_RESPONSE_V1, FETCH_RESPONSE_V2, FETCH_RESPONSE_V3, FETCH_RESPONSE_V4, FETCH_RESPONSE_V5}; + /** + * The body of FETCH_RESPONSE_V6 is the same as FETCH_RESPONSE_V5. + * The version number is bumped up to indicate that the client supports UnknownRetriableException. + * The KafkaStorageException will be translated to NotLeaderForPartitionException if version <= 5 + */ + public static final Schema FETCH_RESPONSE_V6 = FETCH_RESPONSE_V5; + + public static final Schema[] FETCH_REQUEST = {FETCH_REQUEST_V0, FETCH_REQUEST_V1, FETCH_REQUEST_V2, FETCH_REQUEST_V3, FETCH_REQUEST_V4, FETCH_REQUEST_V5, FETCH_REQUEST_V6}; + public static final Schema[] FETCH_RESPONSE = {FETCH_RESPONSE_V0, FETCH_RESPONSE_V1, FETCH_RESPONSE_V2, FETCH_RESPONSE_V3, FETCH_RESPONSE_V4, FETCH_RESPONSE_V5, FETCH_RESPONSE_V6}; /* List groups api */ public static final Schema LIST_GROUPS_REQUEST_V0 = new Schema(); @@ -1039,6 +1097,17 @@ public class Protocol { new Field("zk_version", INT32, "The ZK version."), new Field("replicas", new ArrayOf(INT32), "The replica ids.")); + public static final Schema LEADER_AND_ISR_REQUEST_PARTITION_STATE_V1 = + new Schema(new Field("topic", STRING, "Topic name."), + new Field("partition", INT32, "Topic partition id."), + new Field("controller_epoch", INT32, "The controller epoch."), + new Field("leader", INT32, "The broker id for the leader."), + new Field("leader_epoch", INT32, "The leader epoch."), + new Field("isr", new 
ArrayOf(INT32), "The in sync replica ids."), + new Field("zk_version", INT32, "The ZK version."), + new Field("replicas", new ArrayOf(INT32), "The replica ids."), + new Field("is_new", BOOLEAN, "Whether the replica should have existed on the broker or not")); + public static final Schema LEADER_AND_ISR_REQUEST_LIVE_LEADER_V0 = new Schema(new Field("id", INT32, "The broker id."), new Field("host", STRING, "The hostname of the broker."), @@ -1050,6 +1119,13 @@ public class Protocol { new ArrayOf(LEADER_AND_ISR_REQUEST_PARTITION_STATE_V0)), new Field("live_leaders", new ArrayOf(LEADER_AND_ISR_REQUEST_LIVE_LEADER_V0))); + // LEADER_AND_ISR_REQUEST_V1 added a per-partition is_new field. This field specifies Whether the replica should have existed on the broker or not. + public static final Schema LEADER_AND_ISR_REQUEST_V1 = new Schema(new Field("controller_id", INT32, "The controller id."), + new Field("controller_epoch", INT32, "The controller epoch."), + new Field("partition_states", + new ArrayOf(LEADER_AND_ISR_REQUEST_PARTITION_STATE_V1)), + new Field("live_leaders", new ArrayOf(LEADER_AND_ISR_REQUEST_LIVE_LEADER_V0))); + public static final Schema LEADER_AND_ISR_RESPONSE_PARTITION_V0 = new Schema(new Field("topic", STRING, "Topic name."), new Field("partition", INT32, "Topic partition id."), new Field("error_code", INT16, "Error code.")); @@ -1058,8 +1134,10 @@ public class Protocol { new Field("partitions", new ArrayOf(LEADER_AND_ISR_RESPONSE_PARTITION_V0))); - public static final Schema[] LEADER_AND_ISR_REQUEST = {LEADER_AND_ISR_REQUEST_V0}; - public static final Schema[] LEADER_AND_ISR_RESPONSE = {LEADER_AND_ISR_RESPONSE_V0}; + public static final Schema LEADER_AND_ISR_RESPONSE_V1 = LEADER_AND_ISR_RESPONSE_V0; + + public static final Schema[] LEADER_AND_ISR_REQUEST = {LEADER_AND_ISR_REQUEST_V0, LEADER_AND_ISR_REQUEST_V1}; + public static final Schema[] LEADER_AND_ISR_RESPONSE = {LEADER_AND_ISR_RESPONSE_V0, LEADER_AND_ISR_RESPONSE_V1}; /* Replica api */ public 
static final Schema STOP_REPLICA_REQUEST_PARTITION_V0 = new Schema(new Field("topic", STRING, "Topic name."), @@ -1141,6 +1219,18 @@ public class Protocol { public static final Schema UPDATE_METADATA_REQUEST_PARTITION_STATE_V3 = UPDATE_METADATA_REQUEST_PARTITION_STATE_V2; + // UPDATE_METADATA_REQUEST_PARTITION_STATE_V4 added a per-partition offline_replicas field. This field specifies the list of replicas that are offline. + public static final Schema UPDATE_METADATA_REQUEST_PARTITION_STATE_V4 = + new Schema(new Field("topic", STRING, "Topic name."), + new Field("partition", INT32, "Topic partition id."), + new Field("controller_epoch", INT32, "The controller epoch."), + new Field("leader", INT32, "The broker id for the leader."), + new Field("leader_epoch", INT32, "The leader epoch."), + new Field("isr", new ArrayOf(INT32), "The in sync replica ids."), + new Field("zk_version", INT32, "The ZK version."), + new Field("replicas", new ArrayOf(INT32), "The replica ids."), + new Field("offline_replicas", new ArrayOf(INT32), "The offline replica ids")); + public static final Schema UPDATE_METADATA_REQUEST_END_POINT_V3 = new Schema(new Field("port", INT32, "The port on which the broker accepts requests."), new Field("host", STRING, "The hostname of the broker."), @@ -1158,12 +1248,21 @@ public class Protocol { new Field("partition_states", new ArrayOf(UPDATE_METADATA_REQUEST_PARTITION_STATE_V3)), new Field("live_brokers", new ArrayOf(UPDATE_METADATA_REQUEST_BROKER_V3))); + // UPDATE_METADATA_REQUEST_V4 added a per-partition offline_replicas field. This field specifies the list of replicas that are offline. 
+ public static final Schema UPDATE_METADATA_REQUEST_V4 = + new Schema(new Field("controller_id", INT32, "The controller id."), + new Field("controller_epoch", INT32, "The controller epoch."), + new Field("partition_states", new ArrayOf(UPDATE_METADATA_REQUEST_PARTITION_STATE_V4)), + new Field("live_brokers", new ArrayOf(UPDATE_METADATA_REQUEST_BROKER_V3))); + public static final Schema UPDATE_METADATA_RESPONSE_V3 = UPDATE_METADATA_RESPONSE_V2; + public static final Schema UPDATE_METADATA_RESPONSE_V4 = UPDATE_METADATA_RESPONSE_V3; + public static final Schema[] UPDATE_METADATA_REQUEST = {UPDATE_METADATA_REQUEST_V0, UPDATE_METADATA_REQUEST_V1, - UPDATE_METADATA_REQUEST_V2, UPDATE_METADATA_REQUEST_V3}; + UPDATE_METADATA_REQUEST_V2, UPDATE_METADATA_REQUEST_V3, UPDATE_METADATA_REQUEST_V4}; public static final Schema[] UPDATE_METADATA_RESPONSE = {UPDATE_METADATA_RESPONSE_V0, UPDATE_METADATA_RESPONSE_V1, - UPDATE_METADATA_RESPONSE_V2, UPDATE_METADATA_RESPONSE_V3}; + UPDATE_METADATA_RESPONSE_V2, UPDATE_METADATA_RESPONSE_V3, UPDATE_METADATA_RESPONSE_V4}; /* SASL handshake api */ public static final Schema SASL_HANDSHAKE_REQUEST_V0 = new Schema( diff --git a/clients/src/main/java/org/apache/kafka/common/record/FileRecords.java b/clients/src/main/java/org/apache/kafka/common/record/FileRecords.java index 35431d81592d6..a898634a4542a 100644 --- a/clients/src/main/java/org/apache/kafka/common/record/FileRecords.java +++ b/clients/src/main/java/org/apache/kafka/common/record/FileRecords.java @@ -171,6 +171,13 @@ public void close() throws IOException { channel.close(); } + /** + * Close file handlers used by the FileChannel but don't write to disk. This is used when the disk may have failed + */ + public void closeHandlers() throws IOException { + channel.close(); + } + /** * Delete this message set from the filesystem * @return True iff this message set was deleted. 
diff --git a/clients/src/main/java/org/apache/kafka/common/requests/FetchResponse.java b/clients/src/main/java/org/apache/kafka/common/requests/FetchResponse.java index 824a76fb60bbf..d1774a20833a4 100644 --- a/clients/src/main/java/org/apache/kafka/common/requests/FetchResponse.java +++ b/clients/src/main/java/org/apache/kafka/common/requests/FetchResponse.java @@ -326,10 +326,13 @@ private static Struct toStruct(short version, LinkedHashMap partitionArray = new ArrayList<>(); for (Map.Entry partitionEntry : topicEntry.partitions.entrySet()) { PartitionData fetchPartitionData = partitionEntry.getValue(); + short errorCode = fetchPartitionData.error.code(); + if (errorCode == Errors.KAFKA_STORAGE_ERROR.code() && version <= 5) + errorCode = Errors.NOT_LEADER_FOR_PARTITION.code(); Struct partitionData = topicData.instance(PARTITIONS_KEY_NAME); Struct partitionDataHeader = partitionData.instance(PARTITION_HEADER_KEY_NAME); partitionDataHeader.set(PARTITION_KEY_NAME, partitionEntry.getKey()); - partitionDataHeader.set(ERROR_CODE_KEY_NAME, fetchPartitionData.error.code()); + partitionDataHeader.set(ERROR_CODE_KEY_NAME, errorCode); partitionDataHeader.set(HIGH_WATERMARK_KEY_NAME, fetchPartitionData.highWatermark); if (partitionDataHeader.hasField(LAST_STABLE_OFFSET_KEY_NAME)) { diff --git a/clients/src/main/java/org/apache/kafka/common/requests/LeaderAndIsrRequest.java b/clients/src/main/java/org/apache/kafka/common/requests/LeaderAndIsrRequest.java index 1fdb4a2be49d4..733c9af39da62 100644 --- a/clients/src/main/java/org/apache/kafka/common/requests/LeaderAndIsrRequest.java +++ b/clients/src/main/java/org/apache/kafka/common/requests/LeaderAndIsrRequest.java @@ -45,6 +45,7 @@ public class LeaderAndIsrRequest extends AbstractRequest { private static final String ISR_KEY_NAME = "isr"; private static final String ZK_VERSION_KEY_NAME = "zk_version"; private static final String REPLICAS_KEY_NAME = "replicas"; + private static final String IS_NEW_KEY_NAME = "is_new"; // 
live_leaders key names private static final String END_POINT_ID_KEY_NAME = "id"; @@ -57,9 +58,9 @@ public static class Builder extends AbstractRequest.Builder private final Map partitionStates; private final Set liveLeaders; - public Builder(int controllerId, int controllerEpoch, + public Builder(short version, int controllerId, int controllerEpoch, Map partitionStates, Set liveLeaders) { - super(ApiKeys.LEADER_AND_ISR); + super(ApiKeys.LEADER_AND_ISR, version); this.controllerId = controllerId; this.controllerEpoch = controllerEpoch; this.partitionStates = partitionStates; @@ -121,10 +122,10 @@ public LeaderAndIsrRequest(Struct struct, short version) { List replicas = new ArrayList<>(replicasArray.length); for (Object r : replicasArray) replicas.add((Integer) r); + boolean isNew = partitionStateData.hasField(IS_NEW_KEY_NAME) ? partitionStateData.getBoolean(IS_NEW_KEY_NAME) : false; - PartitionState partitionState = new PartitionState(controllerEpoch, leader, leaderEpoch, isr, zkVersion, replicas); + PartitionState partitionState = new PartitionState(controllerEpoch, leader, leaderEpoch, isr, zkVersion, replicas, isNew); partitionStates.put(new TopicPartition(topic, partition), partitionState); - } Set leaders = new HashSet<>(); @@ -162,6 +163,8 @@ protected Struct toStruct() { partitionStateData.set(ISR_KEY_NAME, partitionState.isr.toArray()); partitionStateData.set(ZK_VERSION_KEY_NAME, partitionState.zkVersion); partitionStateData.set(REPLICAS_KEY_NAME, partitionState.replicas.toArray()); + if (partitionStateData.hasField(IS_NEW_KEY_NAME)) + partitionStateData.set(IS_NEW_KEY_NAME, partitionState.isNew); partitionStatesData.add(partitionStateData); } struct.set(PARTITION_STATES_KEY_NAME, partitionStatesData.toArray()); @@ -188,6 +191,7 @@ public AbstractResponse getErrorResponse(int throttleTimeMs, Throwable e) { short versionId = version(); switch (versionId) { case 0: + case 1: return new LeaderAndIsrResponse(Errors.NONE, responses); default: throw new 
IllegalArgumentException(String.format("Version %d is not valid. Valid versions for %s are 0 to %d", diff --git a/clients/src/main/java/org/apache/kafka/common/requests/MetadataResponse.java b/clients/src/main/java/org/apache/kafka/common/requests/MetadataResponse.java index b79876464e0f1..66559340bf691 100644 --- a/clients/src/main/java/org/apache/kafka/common/requests/MetadataResponse.java +++ b/clients/src/main/java/org/apache/kafka/common/requests/MetadataResponse.java @@ -80,6 +80,7 @@ public class MetadataResponse extends AbstractResponse { private static final String LEADER_KEY_NAME = "leader"; private static final String REPLICAS_KEY_NAME = "replicas"; private static final String ISR_KEY_NAME = "isr"; + private static final String OFFLINE_REPLICAS_KEY_NAME = "offline_replicas"; private final int throttleTimeMs; private final Collection brokers; @@ -149,26 +150,18 @@ public MetadataResponse(Struct struct) { int partition = partitionInfo.getInt(PARTITION_KEY_NAME); int leader = partitionInfo.getInt(LEADER_KEY_NAME); Node leaderNode = leader == -1 ? null : brokers.get(leader); - Object[] replicas = (Object[]) partitionInfo.get(REPLICAS_KEY_NAME); - List replicaNodes = new ArrayList<>(replicas.length); - for (Object replicaNodeId : replicas) { - if (brokers.containsKey(replicaNodeId)) - replicaNodes.add(brokers.get(replicaNodeId)); - else - replicaNodes.add(new Node((int) replicaNodeId, "", -1)); - } + Object[] replicas = (Object[]) partitionInfo.get(REPLICAS_KEY_NAME); + List replicaNodes = convertToNodes(brokers, replicas); Object[] isr = (Object[]) partitionInfo.get(ISR_KEY_NAME); - List isrNodes = new ArrayList<>(isr.length); - for (Object isrNode : isr) { - if (brokers.containsKey(isrNode)) - isrNodes.add(brokers.get(isrNode)); - else - isrNodes.add(new Node((int) isrNode, "", -1)); - } + List isrNodes = convertToNodes(brokers, isr); + + Object[] offlineReplicas = partitionInfo.hasField(OFFLINE_REPLICAS_KEY_NAME) ? 
+ (Object[]) partitionInfo.get(OFFLINE_REPLICAS_KEY_NAME) : new Object[0]; + List offlineNodes = convertToNodes(brokers, offlineReplicas); - partitionMetadata.add(new PartitionMetadata(partitionError, partition, leaderNode, replicaNodes, isrNodes)); + partitionMetadata.add(new PartitionMetadata(partitionError, partition, leaderNode, replicaNodes, isrNodes, offlineNodes)); } topicMetadata.add(new TopicMetadata(topicError, topic, isInternal, partitionMetadata)); @@ -179,6 +172,16 @@ public MetadataResponse(Struct struct) { this.topicMetadata = topicMetadata; } + private List convertToNodes(Map brokers, Object[] brokerIds) { + List nodes = new ArrayList<>(brokerIds.length); + for (Object brokerId : brokerIds) + if (brokers.containsKey(brokerId)) + nodes.add(brokers.get(brokerId)); + else + nodes.add(new Node((int) brokerId, "", -1)); + return nodes; + } + private Node getControllerNode(int controllerId, Collection brokers) { for (Node broker : brokers) { if (broker.id() == controllerId) @@ -256,7 +259,8 @@ public Cluster cluster() { partitionMetadata.partition, partitionMetadata.leader, partitionMetadata.replicas.toArray(new Node[0]), - partitionMetadata.isr.toArray(new Node[0]))); + partitionMetadata.isr.toArray(new Node[0]), + partitionMetadata.offlineReplicas.toArray(new Node[0]))); } } @@ -334,23 +338,27 @@ public List partitionMetadata() { } + // This is used to describe MetadataResponsePartitionState public static class PartitionMetadata { private final Errors error; private final int partition; private final Node leader; private final List replicas; private final List isr; + private final List offlineReplicas; public PartitionMetadata(Errors error, int partition, Node leader, List replicas, - List isr) { + List isr, + List offlineReplicas) { this.error = error; this.partition = partition; this.leader = leader; this.replicas = replicas; this.isr = isr; + this.offlineReplicas = offlineReplicas; } public Errors error() { @@ -373,6 +381,10 @@ public List isr() { 
return isr; } + public List offlineReplicas() { + return offlineReplicas; + } + @Override public String toString() { return "(type=PartitionMetadata," + @@ -433,6 +445,12 @@ protected Struct toStruct(short version) { for (Node node : partitionMetadata.isr) isr.add(node.id()); partitionData.set(ISR_KEY_NAME, isr.toArray()); + if (partitionData.hasField(OFFLINE_REPLICAS_KEY_NAME)) { + ArrayList offlineReplicas = new ArrayList<>(partitionMetadata.offlineReplicas.size()); + for (Node node : partitionMetadata.offlineReplicas) + offlineReplicas.add(node.id()); + partitionData.set(OFFLINE_REPLICAS_KEY_NAME, offlineReplicas.toArray()); + } partitionMetadataArray.add(partitionData); } diff --git a/clients/src/main/java/org/apache/kafka/common/requests/PartitionState.java b/clients/src/main/java/org/apache/kafka/common/requests/PartitionState.java index 394a60f6010d6..8ca7c1f638d72 100644 --- a/clients/src/main/java/org/apache/kafka/common/requests/PartitionState.java +++ b/clients/src/main/java/org/apache/kafka/common/requests/PartitionState.java @@ -20,6 +20,7 @@ import java.util.List; +// This is used to describe LeaderAndIsrPartitionInfo public class PartitionState { public final int controllerEpoch; public final int leader; @@ -27,14 +28,16 @@ public class PartitionState { public final List isr; public final int zkVersion; public final List replicas; + public final boolean isNew; - public PartitionState(int controllerEpoch, int leader, int leaderEpoch, List isr, int zkVersion, List replicas) { + public PartitionState(int controllerEpoch, int leader, int leaderEpoch, List isr, int zkVersion, List replicas, boolean isNew) { this.controllerEpoch = controllerEpoch; this.leader = leader; this.leaderEpoch = leaderEpoch; this.isr = isr; this.zkVersion = zkVersion; this.replicas = replicas; + this.isNew = isNew; } @Override @@ -44,6 +47,7 @@ public String toString() { ", leaderEpoch=" + leaderEpoch + ", isr=" + Utils.join(isr, ",") + ", zkVersion=" + zkVersion + - ", replicas=" 
+ Utils.join(replicas, ",") + ")"; + ", replicas=" + Utils.join(replicas, ",") + + ", isNew=" + isNew + ")"; } } diff --git a/clients/src/main/java/org/apache/kafka/common/requests/ProduceRequest.java b/clients/src/main/java/org/apache/kafka/common/requests/ProduceRequest.java index 3d696c1bec692..9b5a428a8ef44 100644 --- a/clients/src/main/java/org/apache/kafka/common/requests/ProduceRequest.java +++ b/clients/src/main/java/org/apache/kafka/common/requests/ProduceRequest.java @@ -246,6 +246,7 @@ public ProduceResponse getErrorResponse(int throttleTimeMs, Throwable e) { case 1: case 2: case 3: + case 4: return new ProduceResponse(responseMap, throttleTimeMs); default: throw new IllegalArgumentException(String.format("Version %d is not valid. Valid versions for %s are 0 to %d", @@ -309,6 +310,9 @@ public static byte requiredMagicForVersion(short produceRequestVersion) { case 3: return RecordBatch.MAGIC_VALUE_V2; + case 4: + return RecordBatch.MAGIC_VALUE_V2; + default: // raise an exception if the version has not been explicitly added to this method. 
// this ensures that we cannot accidentally use the wrong magic value if we forget diff --git a/clients/src/main/java/org/apache/kafka/common/requests/ProduceResponse.java b/clients/src/main/java/org/apache/kafka/common/requests/ProduceResponse.java index d42f1c6dde73d..304ae22d22a05 100644 --- a/clients/src/main/java/org/apache/kafka/common/requests/ProduceResponse.java +++ b/clients/src/main/java/org/apache/kafka/common/requests/ProduceResponse.java @@ -122,9 +122,12 @@ protected Struct toStruct(short version) { List partitionArray = new ArrayList<>(); for (Map.Entry partitionEntry : entry.getValue().entrySet()) { PartitionResponse part = partitionEntry.getValue(); + short errorCode = part.error.code(); + if (errorCode == Errors.KAFKA_STORAGE_ERROR.code() && version <= 3) + errorCode = Errors.NOT_LEADER_FOR_PARTITION.code(); Struct partStruct = topicData.instance(PARTITION_RESPONSES_KEY_NAME) .set(PARTITION_KEY_NAME, partitionEntry.getKey()) - .set(ERROR_CODE_KEY_NAME, part.error.code()) + .set(ERROR_CODE_KEY_NAME, errorCode) .set(BASE_OFFSET_KEY_NAME, part.baseOffset); if (partStruct.hasField(LOG_APPEND_TIME_KEY_NAME)) partStruct.set(LOG_APPEND_TIME_KEY_NAME, part.logAppendTime); diff --git a/clients/src/main/java/org/apache/kafka/common/requests/UpdateMetadataRequest.java b/clients/src/main/java/org/apache/kafka/common/requests/UpdateMetadataRequest.java index 8f9b592c431d4..69943b4ab1f7c 100644 --- a/clients/src/main/java/org/apache/kafka/common/requests/UpdateMetadataRequest.java +++ b/clients/src/main/java/org/apache/kafka/common/requests/UpdateMetadataRequest.java @@ -27,6 +27,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -74,6 +75,43 @@ public String toString() { } } + public static final class PartitionState { + public final int controllerEpoch; + public final int leader; + public final int leaderEpoch; + public final List isr; + public 
final int zkVersion; + public final List replicas; + public final List offlineReplicas; + + public PartitionState(int controllerEpoch, + int leader, + int leaderEpoch, + List isr, + int zkVersion, + List replicas, + List offlineReplicas) { + this.controllerEpoch = controllerEpoch; + this.leader = leader; + this.leaderEpoch = leaderEpoch; + this.isr = isr; + this.zkVersion = zkVersion; + this.replicas = replicas; + this.offlineReplicas = offlineReplicas; + } + + @Override + public String toString() { + return "UpdateMetadataRequestPartitionState(controllerEpoch=" + controllerEpoch + + ", leader=" + leader + + ", leaderEpoch=" + leaderEpoch + + ", isr=" + Arrays.toString(isr.toArray()) + + ", zkVersion=" + zkVersion + + ", replicas=" + Arrays.toString(replicas.toArray()) + + ", offlineReplicas=" + Arrays.toString(offlineReplicas.toArray()) + ")"; + } + } + public static final class Broker { public final int id; public final List endPoints; @@ -121,7 +159,7 @@ public String toString() { private static final String PARTITION_STATES_KEY_NAME = "partition_states"; private static final String LIVE_BROKERS_KEY_NAME = "live_brokers"; - // PartitionState key names + // UpdateMetadataRequestPartitionState key names private static final String TOPIC_KEY_NAME = "topic"; private static final String PARTITION_KEY_NAME = "partition"; private static final String LEADER_KEY_NAME = "leader"; @@ -129,6 +167,7 @@ public String toString() { private static final String ISR_KEY_NAME = "isr"; private static final String ZK_VERSION_KEY_NAME = "zk_version"; private static final String REPLICAS_KEY_NAME = "replicas"; + private static final String OFFLINE_REPLICAS_KEY_NAME = "offline_replicas"; // Broker key names private static final String BROKER_ID_KEY_NAME = "id"; @@ -146,8 +185,8 @@ public String toString() { private final Map partitionStates; private final Set liveBrokers; - private UpdateMetadataRequest(short version, int controllerId, int controllerEpoch, Map partitionStates, Set liveBrokers) 
{ + private UpdateMetadataRequest(short version, int controllerId, int controllerEpoch, + Map partitionStates, Set liveBrokers) { super(version); this.controllerId = controllerId; this.controllerEpoch = controllerEpoch; @@ -178,9 +217,16 @@ public UpdateMetadataRequest(Struct struct, short versionId) { for (Object r : replicasArray) replicas.add((Integer) r); - PartitionState partitionState = new PartitionState(controllerEpoch, leader, leaderEpoch, isr, zkVersion, replicas); - partitionStates.put(new TopicPartition(topic, partition), partitionState); + List offlineReplicas = new ArrayList<>(); + if (partitionStateData.hasField(OFFLINE_REPLICAS_KEY_NAME)) { + Object[] offlineReplicasArray = partitionStateData.getArray(OFFLINE_REPLICAS_KEY_NAME); + for (Object r : offlineReplicasArray) + offlineReplicas.add((Integer) r); + } + PartitionState partitionState = + new PartitionState(controllerEpoch, leader, leaderEpoch, isr, zkVersion, replicas, offlineReplicas); + partitionStates.put(new TopicPartition(topic, partition), partitionState); } Set liveBrokers = new HashSet<>(); @@ -245,6 +291,8 @@ protected Struct toStruct() { partitionStateData.set(ISR_KEY_NAME, partitionState.isr.toArray()); partitionStateData.set(ZK_VERSION_KEY_NAME, partitionState.zkVersion); partitionStateData.set(REPLICAS_KEY_NAME, partitionState.replicas.toArray()); + if (partitionStateData.hasField(OFFLINE_REPLICAS_KEY_NAME)) + partitionStateData.set(OFFLINE_REPLICAS_KEY_NAME, partitionState.offlineReplicas.toArray()); partitionStatesData.add(partitionStateData); } struct.set(PARTITION_STATES_KEY_NAME, partitionStatesData.toArray()); diff --git a/clients/src/test/java/org/apache/kafka/clients/consumer/internals/FetcherTest.java b/clients/src/test/java/org/apache/kafka/clients/consumer/internals/FetcherTest.java index c0edcfd909b32..bb3162ada2905 100644 --- a/clients/src/test/java/org/apache/kafka/clients/consumer/internals/FetcherTest.java +++ 
b/clients/src/test/java/org/apache/kafka/clients/consumer/internals/FetcherTest.java @@ -1979,7 +1979,8 @@ private MetadataResponse newMetadataResponse(String topic, Errors error) { partitionInfo.partition(), partitionInfo.leader(), Arrays.asList(partitionInfo.replicas()), - Arrays.asList(partitionInfo.inSyncReplicas()))); + Arrays.asList(partitionInfo.inSyncReplicas()), + Arrays.asList(partitionInfo.offlineReplicas()))); } } diff --git a/clients/src/test/java/org/apache/kafka/common/PartitionInfoTest.java b/clients/src/test/java/org/apache/kafka/common/PartitionInfoTest.java index 7836023fa68d2..7511d64c44143 100644 --- a/clients/src/test/java/org/apache/kafka/common/PartitionInfoTest.java +++ b/clients/src/test/java/org/apache/kafka/common/PartitionInfoTest.java @@ -20,20 +20,21 @@ import org.junit.Test; public class PartitionInfoTest { - + @Test public void testToString() { String topic = "sample"; int partition = 0; Node leader = new Node(0, "localhost", 9092); Node r1 = new Node(1, "localhost", 9093); - Node r2 = new Node(2, "localhost", 9094); + Node r2 = new Node(2, "localhost", 9094); Node[] replicas = new Node[] {leader, r1, r2}; - Node[] inSyncReplicas = new Node[] {leader, r1, r2}; - PartitionInfo partitionInfo = new PartitionInfo(topic, partition, leader, replicas, inSyncReplicas); - - String expected = String.format("Partition(topic = %s, partition = %d, leader = %s, replicas = %s, isr = %s)", - topic, partition, leader.idString(), "[0,1,2]", "[0,1,2]"); + Node[] inSyncReplicas = new Node[] {leader, r1}; + Node[] offlineReplicas = new Node[] {r2}; + PartitionInfo partitionInfo = new PartitionInfo(topic, partition, leader, replicas, inSyncReplicas, offlineReplicas); + + String expected = String.format("Partition(topic = %s, partition = %d, leader = %s, replicas = %s, isr = %s, offlineReplicas = %s)", + topic, partition, leader.idString(), "[0,1,2]", "[0,1]", "[2]"); Assert.assertEquals(expected, partitionInfo.toString()); } diff --git 
a/clients/src/test/java/org/apache/kafka/common/requests/RequestResponseTest.java b/clients/src/test/java/org/apache/kafka/common/requests/RequestResponseTest.java index a3c277f247c3e..5ed3fff3f216d 100644 --- a/clients/src/test/java/org/apache/kafka/common/requests/RequestResponseTest.java +++ b/clients/src/test/java/org/apache/kafka/common/requests/RequestResponseTest.java @@ -723,10 +723,11 @@ private MetadataResponse createMetadataResponse() { Node node = new Node(1, "host1", 1001); List replicas = asList(node); List isr = asList(node); + List offlineReplicas = asList(); List allTopicMetadata = new ArrayList<>(); allTopicMetadata.add(new MetadataResponse.TopicMetadata(Errors.NONE, "__consumer_offsets", true, - asList(new MetadataResponse.PartitionMetadata(Errors.NONE, 1, node, replicas, isr)))); + asList(new MetadataResponse.PartitionMetadata(Errors.NONE, 1, node, replicas, isr, offlineReplicas)))); allTopicMetadata.add(new MetadataResponse.TopicMetadata(Errors.LEADER_NOT_AVAILABLE, "topic2", false, Collections.emptyList())); @@ -807,18 +808,18 @@ private LeaderAndIsrRequest createLeaderAndIsrRequest() { List isr = asList(1, 2); List replicas = asList(1, 2, 3, 4); partitionStates.put(new TopicPartition("topic5", 105), - new PartitionState(0, 2, 1, new ArrayList<>(isr), 2, replicas)); + new PartitionState(0, 2, 1, new ArrayList<>(isr), 2, replicas, false)); partitionStates.put(new TopicPartition("topic5", 1), - new PartitionState(1, 1, 1, new ArrayList<>(isr), 2, replicas)); + new PartitionState(1, 1, 1, new ArrayList<>(isr), 2, replicas, false)); partitionStates.put(new TopicPartition("topic20", 1), - new PartitionState(1, 0, 1, new ArrayList<>(isr), 2, replicas)); + new PartitionState(1, 0, 1, new ArrayList<>(isr), 2, replicas, false)); Set leaders = Utils.mkSet( new Node(0, "test0", 1223), new Node(1, "test1", 1223) ); - - return new LeaderAndIsrRequest.Builder(1, 10, partitionStates, leaders).build(); + short version = ApiKeys.LEADER_AND_ISR.latestVersion(); 
+ return new LeaderAndIsrRequest.Builder(version, 1, 10, partitionStates, leaders).build(); } private LeaderAndIsrResponse createLeaderAndIsrResponse() { @@ -828,15 +829,16 @@ private LeaderAndIsrResponse createLeaderAndIsrResponse() { } private UpdateMetadataRequest createUpdateMetadataRequest(int version, String rack) { - Map partitionStates = new HashMap<>(); + Map partitionStates = new HashMap<>(); List isr = asList(1, 2); List replicas = asList(1, 2, 3, 4); + List offlineReplicas = asList(); partitionStates.put(new TopicPartition("topic5", 105), - new PartitionState(0, 2, 1, new ArrayList<>(isr), 2, replicas)); + new UpdateMetadataRequest.PartitionState(0, 2, 1, isr, 2, replicas, offlineReplicas)); partitionStates.put(new TopicPartition("topic5", 1), - new PartitionState(1, 1, 1, new ArrayList<>(isr), 2, replicas)); + new UpdateMetadataRequest.PartitionState(1, 1, 1, isr, 2, replicas, offlineReplicas)); partitionStates.put(new TopicPartition("topic20", 1), - new PartitionState(1, 0, 1, new ArrayList<>(isr), 2, replicas)); + new UpdateMetadataRequest.PartitionState(1, 0, 1, isr, 2, replicas, offlineReplicas)); SecurityProtocol plaintext = SecurityProtocol.PLAINTEXT; List endPoints1 = new ArrayList<>(); diff --git a/core/src/main/scala/kafka/admin/AdminUtils.scala b/core/src/main/scala/kafka/admin/AdminUtils.scala index 923ceb747fb6a..7de85e49a9064 100644 --- a/core/src/main/scala/kafka/admin/AdminUtils.scala +++ b/core/src/main/scala/kafka/admin/AdminUtils.scala @@ -408,7 +408,7 @@ object AdminUtils extends Logging with AdminUtilities { zkUtils.pathExists(getTopicPath(topic)) def getBrokerMetadatas(zkUtils: ZkUtils, rackAwareMode: RackAwareMode = RackAwareMode.Enforced, - brokerList: Option[Seq[Int]] = None): Seq[BrokerMetadata] = { + brokerList: Option[Seq[Int]] = None): Seq[BrokerMetadata] = { val allBrokers = zkUtils.getAllBrokersInCluster() val brokers = brokerList.map(brokerIds => allBrokers.filter(b => brokerIds.contains(b.id))).getOrElse(allBrokers) val 
brokersWithRack = brokers.filter(_.rack.nonEmpty) diff --git a/core/src/main/scala/kafka/api/ApiVersion.scala b/core/src/main/scala/kafka/api/ApiVersion.scala index 62d5d1261ccb3..62e4eff89950a 100644 --- a/core/src/main/scala/kafka/api/ApiVersion.scala +++ b/core/src/main/scala/kafka/api/ApiVersion.scala @@ -69,7 +69,10 @@ object ApiVersion { "0.11.0-IV1" -> KAFKA_0_11_0_IV1, // Introduced leader epoch fetches to the replica fetcher via KIP-101 "0.11.0-IV2" -> KAFKA_0_11_0_IV2, - "0.11.0" -> KAFKA_0_11_0_IV2 + "0.11.0" -> KAFKA_0_11_0_IV2, + // Introduced LeaderAndIsrRequest V1, UpdateMetadataRequest V4 and MetadataResponse V5 via KIP-112 + "0.11.1-IV0" -> KAFKA_0_11_1_IV0, + "0.11.1" -> KAFKA_0_11_1_IV0 ) private val versionPattern = "\\.".r @@ -171,3 +174,10 @@ case object KAFKA_0_11_0_IV2 extends ApiVersion { val messageFormatVersion: Byte = RecordBatch.MAGIC_VALUE_V2 val id: Int = 12 } + +case object KAFKA_0_11_1_IV0 extends ApiVersion { + val version: String = "0.11.1-IV0" + val messageFormatVersion: Byte = RecordBatch.MAGIC_VALUE_V2 + val id: Int = 13 +} + diff --git a/core/src/main/scala/kafka/api/LeaderAndIsr.scala b/core/src/main/scala/kafka/api/LeaderAndIsr.scala index 474d7a0033053..4933f7d3050b7 100644 --- a/core/src/main/scala/kafka/api/LeaderAndIsr.scala +++ b/core/src/main/scala/kafka/api/LeaderAndIsr.scala @@ -50,13 +50,28 @@ case class LeaderAndIsr(leader: Int, } } -case class PartitionStateInfo(leaderIsrAndControllerEpoch: LeaderIsrAndControllerEpoch, allReplicas: Seq[Int]) { +case class LeaderAndIsrPartitionState(leaderIsrAndControllerEpoch: LeaderIsrAndControllerEpoch, allReplicas: Seq[Int], isNew: Boolean) { override def toString: String = { val partitionStateInfo = new StringBuilder partitionStateInfo.append("(LeaderAndIsrInfo:" + leaderIsrAndControllerEpoch.toString) partitionStateInfo.append(",ReplicationFactor:" + allReplicas.size + ")") partitionStateInfo.append(",AllReplicas:" + allReplicas.mkString(",") + ")") + 
partitionStateInfo.append(",isNew:" + isNew + ")") + partitionStateInfo.toString() + } +} + +case class MetadataPartitionState(leaderIsrAndControllerEpoch: LeaderIsrAndControllerEpoch, allReplicas: Seq[Int], offlineReplicas: Seq[Int]) { + + def replicationFactor = allReplicas.size + + override def toString: String = { + val partitionStateInfo = new StringBuilder + partitionStateInfo.append("(LeaderAndIsrInfo:" + leaderIsrAndControllerEpoch.toString) + partitionStateInfo.append(",ReplicationFactor:" + replicationFactor + ")") + partitionStateInfo.append(",AllReplicas:" + allReplicas.mkString(",") + ")") + partitionStateInfo.append(",OfflineReplicas:" + offlineReplicas.mkString(",") + ")") partitionStateInfo.toString() } } diff --git a/core/src/main/scala/kafka/cluster/Partition.scala b/core/src/main/scala/kafka/cluster/Partition.scala index ebf314049c203..4ce7344a92438 100755 --- a/core/src/main/scala/kafka/cluster/Partition.scala +++ b/core/src/main/scala/kafka/cluster/Partition.scala @@ -16,9 +16,8 @@ */ package kafka.cluster -import java.io.IOException -import java.util.concurrent.locks.ReentrantReadWriteLock +import java.util.concurrent.locks.ReentrantReadWriteLock import com.yammer.metrics.core.Gauge import kafka.admin.AdminUtils import kafka.api.LeaderAndIsr @@ -114,13 +113,13 @@ class Partition(val topic: String, def isUnderReplicated: Boolean = isLeaderReplicaLocal && inSyncReplicas.size < assignedReplicas.size - def getOrCreateReplica(replicaId: Int = localBrokerId): Replica = { + def getOrCreateReplica(replicaId: Int = localBrokerId, isNew: Boolean = false): Replica = { assignedReplicaMap.getAndMaybePut(replicaId, { if (isReplicaLocal(replicaId)) { val config = LogConfig.fromProps(logManager.defaultConfig.originals, AdminUtils.fetchEntityConfig(zkUtils, ConfigType.Topic, topic)) - val log = logManager.createLog(topicPartition, config) - val checkpoint = replicaManager.highWatermarkCheckpoints(log.dir.getParentFile.getAbsolutePath) + val log = 
logManager.getOrCreateLog(topicPartition, config, isNew) + val checkpoint = replicaManager.highWatermarkCheckpoints(log.dir.getParent) val offsetMap = checkpoint.read if (!offsetMap.contains(topicPartition)) info(s"No checkpointed highwatermark is found for partition $topicPartition") @@ -151,14 +150,9 @@ class Partition(val topic: String, assignedReplicaMap.clear() inSyncReplicas = Set.empty[Replica] leaderReplicaIdOpt = None - try { - logManager.asyncDelete(topicPartition) - removePartitionMetrics() - } catch { - case e: IOException => - fatal(s"Error deleting the log for partition $topicPartition", e) - Exit.halt(1) - } + removePartitionMetrics() + // This call may throw exception if the log is on offline directory + logManager.asyncDelete(topicPartition) } } @@ -176,7 +170,7 @@ class Partition(val topic: String, // to maintain the decision maker controller's epoch in the zookeeper path controllerEpoch = partitionStateInfo.controllerEpoch // add replicas that are new - val newInSyncReplicas = partitionStateInfo.isr.asScala.map(r => getOrCreateReplica(r)).toSet + val newInSyncReplicas = partitionStateInfo.isr.asScala.map(r => getOrCreateReplica(r, partitionStateInfo.isNew)).toSet // remove assigned replicas that have been removed by the controller (assignedReplicas.map(_.brokerId) -- allReplicas).foreach(removeReplica) inSyncReplicas = newInSyncReplicas @@ -230,7 +224,7 @@ class Partition(val topic: String, // to maintain the decision maker controller's epoch in the zookeeper path controllerEpoch = partitionStateInfo.controllerEpoch // add replicas that are new - allReplicas.foreach(r => getOrCreateReplica(r)) + allReplicas.foreach(r => getOrCreateReplica(r, partitionStateInfo.isNew)) // remove assigned replicas that have been removed by the controller (assignedReplicas.map(_.brokerId) -- allReplicas).foreach(removeReplica) inSyncReplicas = Set.empty[Replica] @@ -557,7 +551,7 @@ class Partition(val topic: String, /** * remove deleted log metrics */ - private def 
removePartitionMetrics() { + def removePartitionMetrics() { removeMetric("UnderReplicated", tags) removeMetric("InSyncReplicasCount", tags) removeMetric("ReplicasCount", tags) diff --git a/core/src/main/scala/kafka/common/KafkaStorageException.scala b/core/src/main/scala/kafka/common/KafkaStorageException.scala index 21dd5835928a0..c246922eef13d 100644 --- a/core/src/main/scala/kafka/common/KafkaStorageException.scala +++ b/core/src/main/scala/kafka/common/KafkaStorageException.scala @@ -17,7 +17,7 @@ package kafka.common /** - * Kafka exception caused by real IOs. + * Kafka exception caused by disk-related IOException */ class KafkaStorageException(message: String, t: Throwable) extends RuntimeException(message, t) { def this(message: String) = this(message, null) diff --git a/core/src/main/scala/kafka/consumer/ConsumerFetcherThread.scala b/core/src/main/scala/kafka/consumer/ConsumerFetcherThread.scala index 4f14570dbadd7..9f62617628c15 100644 --- a/core/src/main/scala/kafka/consumer/ConsumerFetcherThread.scala +++ b/core/src/main/scala/kafka/consumer/ConsumerFetcherThread.scala @@ -96,9 +96,9 @@ class ConsumerFetcherThread(name: String, } // any logic for partitions whose leader has changed - def handlePartitionsWithErrors(partitions: Iterable[TopicPartition]) { - removePartitions(partitions.toSet) - consumerFetcherManager.addPartitionsWithError(partitions) + def handlePartitionsWithErrors(partitions: Map[TopicPartition, Option[Exception]]) { + removePartitions(partitions.keys.toSet) + consumerFetcherManager.addPartitionsWithError(partitions.keys) } protected def buildFetchRequest(partitionMap: collection.Seq[(TopicPartition, PartitionFetchState)]): FetchRequest = { diff --git a/core/src/main/scala/kafka/controller/ControllerChannelManager.scala b/core/src/main/scala/kafka/controller/ControllerChannelManager.scala index ee8fa1eb27edd..eea14c4320498 100755 --- a/core/src/main/scala/kafka/controller/ControllerChannelManager.scala +++ 
b/core/src/main/scala/kafka/controller/ControllerChannelManager.scala @@ -31,7 +31,7 @@ import org.apache.kafka.common.metrics.Metrics import org.apache.kafka.common.network._ import org.apache.kafka.common.protocol.{ApiKeys, SecurityProtocol} import org.apache.kafka.common.requests.UpdateMetadataRequest.EndPoint -import org.apache.kafka.common.requests.{UpdateMetadataRequest, _} +import org.apache.kafka.common.requests._ import org.apache.kafka.common.security.JaasContext import org.apache.kafka.common.utils.Time import org.apache.kafka.common.{Node, TopicPartition, requests} @@ -281,10 +281,10 @@ class RequestSendThread(val controllerId: Int, class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging { val controllerContext = controller.controllerContext val controllerId: Int = controller.config.brokerId - val leaderAndIsrRequestMap = mutable.Map.empty[Int, mutable.Map[TopicPartition, PartitionStateInfo]] + val leaderAndIsrRequestMap = mutable.Map.empty[Int, mutable.Map[TopicPartition, LeaderAndIsrPartitionState]] val stopReplicaRequestMap = mutable.Map.empty[Int, Seq[StopReplicaRequestInfo]] val updateMetadataRequestBrokerSet = mutable.Set.empty[Int] - val updateMetadataRequestPartitionInfoMap = mutable.Map.empty[TopicPartition, PartitionStateInfo] + val updateMetadataRequestPartitionInfoMap = mutable.Map.empty[TopicPartition, MetadataPartitionState] private val stateChangeLogger = KafkaController.stateChangeLogger def newBatch() { @@ -310,12 +310,14 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging def addLeaderAndIsrRequestForBrokers(brokerIds: Seq[Int], topic: String, partition: Int, leaderIsrAndControllerEpoch: LeaderIsrAndControllerEpoch, - replicas: Seq[Int], callback: AbstractResponse => Unit = null) { + replicas: Seq[Int], callback: AbstractResponse => Unit = null, + isNew: Boolean = false) { val topicPartition = new TopicPartition(topic, partition) brokerIds.filter(_ >= 0).foreach { brokerId => val 
result = leaderAndIsrRequestMap.getOrElseUpdate(brokerId, mutable.Map.empty) - result.put(topicPartition, PartitionStateInfo(leaderIsrAndControllerEpoch, replicas)) + val alreadyNew = result.get(topicPartition).exists(_.isNew) + result.put(topicPartition, LeaderAndIsrPartitionState(leaderIsrAndControllerEpoch, replicas, isNew || alreadyNew)) } addUpdateMetadataRequestForBrokers(controllerContext.liveOrShuttingDownBrokerIds.toSeq, @@ -345,7 +347,7 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging leaderIsrAndControllerEpochOpt match { case Some(l @ LeaderIsrAndControllerEpoch(leaderAndIsr, controllerEpoch)) => val replicas = controllerContext.partitionReplicaAssignment(partition) - + val offlineReplicas = replicas.filter(!controllerContext.isReplicaOnline(_, partition)) val leaderIsrAndControllerEpoch = if (beingDeleted) { val leaderDuringDelete = LeaderAndIsr.duringDelete(leaderAndIsr.isr) LeaderIsrAndControllerEpoch(leaderDuringDelete, controllerEpoch) @@ -353,7 +355,7 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging l } - val partitionStateInfo = PartitionStateInfo(leaderIsrAndControllerEpoch, replicas) + val partitionStateInfo = MetadataPartitionState(leaderIsrAndControllerEpoch, replicas, offlineReplicas) updateMetadataRequestPartitionInfoMap.put(new TopicPartition(partition.topic, partition.partition), partitionStateInfo) case None => @@ -379,8 +381,12 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging def sendRequestsToBrokers(controllerEpoch: Int) { try { - leaderAndIsrRequestMap.foreach { case (broker, partitionStateInfos) => - partitionStateInfos.foreach { case (topicPartition, state) => + val leaderAndIsrRequestVersion: Short = + if (controller.config.interBrokerProtocolVersion >= KAFKA_0_11_1_IV0) 1 + else 0 + + leaderAndIsrRequestMap.foreach { case (broker, leaderAndIsrPartitionStates) => + leaderAndIsrPartitionStates.foreach { case (topicPartition, state) 
=> val typeOfRequest = if (broker == state.leaderIsrAndControllerEpoch.leaderAndIsr.leader) "become-leader" else "become-follower" @@ -389,20 +395,21 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging state.leaderIsrAndControllerEpoch, broker, topicPartition.topic, topicPartition.partition)) } - val leaderIds = partitionStateInfos.map(_._2.leaderIsrAndControllerEpoch.leaderAndIsr.leader).toSet + val leaderIds = leaderAndIsrPartitionStates.map(_._2.leaderIsrAndControllerEpoch.leaderAndIsr.leader).toSet val leaders = controllerContext.liveOrShuttingDownBrokers.filter(b => leaderIds.contains(b.id)).map { _.getNode(controller.config.interBrokerListenerName) } - val partitionStates = partitionStateInfos.map { case (topicPartition, partitionStateInfo) => - val LeaderIsrAndControllerEpoch(leaderIsr, controllerEpoch) = partitionStateInfo.leaderIsrAndControllerEpoch + val partitionStates = leaderAndIsrPartitionStates.map { case (topicPartition, leaderAndIsrPartitionState) => + val LeaderIsrAndControllerEpoch(leaderIsr, controllerEpoch) = leaderAndIsrPartitionState.leaderIsrAndControllerEpoch val partitionState = new requests.PartitionState(controllerEpoch, leaderIsr.leader, leaderIsr.leaderEpoch, leaderIsr.isr.map(Integer.valueOf).asJava, leaderIsr.zkVersion, - partitionStateInfo.allReplicas.map(Integer.valueOf).asJava) + leaderAndIsrPartitionState.allReplicas.map(Integer.valueOf).asJava, leaderAndIsrPartitionState.isNew) topicPartition -> partitionState } - val leaderAndIsrRequest = new LeaderAndIsrRequest.Builder(controllerId, controllerEpoch, partitionStates.asJava, - leaders.asJava) - controller.sendRequest(broker, ApiKeys.LEADER_AND_ISR, leaderAndIsrRequest) + val leaderAndIsrRequestBuilder = new LeaderAndIsrRequest.Builder(leaderAndIsrRequestVersion, controllerId, + controllerEpoch, partitionStates.asJava, leaders.asJava) + controller.sendRequest(broker, ApiKeys.LEADER_AND_ISR, leaderAndIsrRequestBuilder, + (r: AbstractResponse) => 
controller.eventManager.put(controller.LeaderAndIsrResponseReceived(r, broker))) } leaderAndIsrRequestMap.clear() @@ -411,20 +418,21 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging updateMetadataRequestBrokerSet.toString(), p._1))) val partitionStates = updateMetadataRequestPartitionInfoMap.map { case (topicPartition, partitionStateInfo) => val LeaderIsrAndControllerEpoch(leaderIsr, controllerEpoch) = partitionStateInfo.leaderIsrAndControllerEpoch - val partitionState = new requests.PartitionState(controllerEpoch, leaderIsr.leader, + val partitionState = new UpdateMetadataRequest.PartitionState(controllerEpoch, leaderIsr.leader, leaderIsr.leaderEpoch, leaderIsr.isr.map(Integer.valueOf).asJava, leaderIsr.zkVersion, - partitionStateInfo.allReplicas.map(Integer.valueOf).asJava) + partitionStateInfo.allReplicas.map(Integer.valueOf).asJava, partitionStateInfo.offlineReplicas.map(Integer.valueOf).asJava) topicPartition -> partitionState } - val version: Short = - if (controller.config.interBrokerProtocolVersion >= KAFKA_0_10_2_IV0) 3 + val updateMetadataRequestVersion: Short = + if (controller.config.interBrokerProtocolVersion >= KAFKA_0_11_1_IV0) 4 + else if (controller.config.interBrokerProtocolVersion >= KAFKA_0_10_2_IV0) 3 else if (controller.config.interBrokerProtocolVersion >= KAFKA_0_10_0_IV1) 2 else if (controller.config.interBrokerProtocolVersion >= KAFKA_0_9_0) 1 else 0 val updateMetadataRequest = { - val liveBrokers = if (version == 0) { + val liveBrokers = if (updateMetadataRequestVersion == 0) { // Version 0 of UpdateMetadataRequest only supports PLAINTEXT. 
controllerContext.liveOrShuttingDownBrokers.map { broker => val securityProtocol = SecurityProtocol.PLAINTEXT @@ -441,7 +449,7 @@ class ControllerBrokerRequestBatch(controller: KafkaController) extends Logging new UpdateMetadataRequest.Broker(broker.id, endPoints.asJava, broker.rack.orNull) } } - new UpdateMetadataRequest.Builder(version, controllerId, controllerEpoch, partitionStates.asJava, + new UpdateMetadataRequest.Builder(updateMetadataRequestVersion, controllerId, controllerEpoch, partitionStates.asJava, liveBrokers.asJava) } diff --git a/core/src/main/scala/kafka/controller/ControllerState.scala b/core/src/main/scala/kafka/controller/ControllerState.scala index 2f690bbbdf8c9..74029b1b60137 100644 --- a/core/src/main/scala/kafka/controller/ControllerState.scala +++ b/core/src/main/scala/kafka/controller/ControllerState.scala @@ -78,6 +78,15 @@ object ControllerState { def value = 9 } + case object LeaderAndIsrResponseReceived extends ControllerState { + def value = 10 + } + + case object LogDirChange extends ControllerState { + def value = 11 + } + val values: Seq[ControllerState] = Seq(Idle, ControllerChange, BrokerChange, TopicChange, TopicDeletion, - PartitionReassignment, AutoLeaderBalance, ManualLeaderBalance, ControlledShutdown, IsrChange) + PartitionReassignment, AutoLeaderBalance, ManualLeaderBalance, ControlledShutdown, IsrChange, LeaderAndIsrResponseReceived, + LogDirChange) } diff --git a/core/src/main/scala/kafka/controller/KafkaController.scala b/core/src/main/scala/kafka/controller/KafkaController.scala index ff47f1458f251..a7d33a780fb68 100644 --- a/core/src/main/scala/kafka/controller/KafkaController.scala +++ b/core/src/main/scala/kafka/controller/KafkaController.scala @@ -18,7 +18,8 @@ package kafka.controller import java.util.concurrent.TimeUnit -import com.yammer.metrics.core.Gauge +import org.apache.kafka.common.requests.LeaderAndIsrResponse +import com.yammer.metrics.core.{Gauge, Meter} import kafka.admin.{AdminUtils, 
PreferredReplicaLeaderElectionCommand} import kafka.api._ import kafka.cluster.Broker @@ -52,6 +53,8 @@ class ControllerContext(val zkUtils: ZkUtils) { var partitionReplicaAssignment: mutable.Map[TopicAndPartition, Seq[Int]] = mutable.Map.empty var partitionLeadershipInfo: mutable.Map[TopicAndPartition, LeaderIsrAndControllerEpoch] = mutable.Map.empty val partitionsBeingReassigned: mutable.Map[TopicAndPartition, ReassignedPartitionsContext] = new mutable.HashMap + val partitionsUndergoingPreferredReplicaElection: mutable.Set[TopicAndPartition] = new mutable.HashSet + val replicasOnOfflineDisks: mutable.Map[Int, Set[TopicAndPartition]] = mutable.HashMap.empty private var liveBrokersUnderlying: Set[Broker] = Set.empty private var liveBrokerIdsUnderlying: Set[Int] = Set.empty @@ -75,6 +78,14 @@ class ControllerContext(val zkUtils: ZkUtils) { }.toSet } + def isReplicaOnline(brokerId: Int, topicAndPartition: TopicAndPartition, includeShuttingDownBrokers: Boolean = false): Boolean = { + val brokerOnline = { + if (includeShuttingDownBrokers) liveOrShuttingDownBrokerIds.contains(brokerId) + else liveBrokerIds.contains(brokerId) + } + brokerOnline && !replicasOnOfflineDisks.getOrElse(brokerId, Set.empty).contains(topicAndPartition) + } + def replicasOnBrokers(brokerIds: Set[Int]): Set[PartitionAndReplica] = { brokerIds.flatMap { brokerId => partitionReplicaAssignment.collect { @@ -98,7 +109,8 @@ class ControllerContext(val zkUtils: ZkUtils) { partitionReplicaAssignment.keySet.filter(topicAndPartition => topicAndPartition.topic == topic) def allLiveReplicas(): Set[PartitionAndReplica] = { - replicasOnBrokers(liveBrokerIds) + replicasOnBrokers(liveBrokerIds).filter{partitionAndReplica => + isReplicaOnline(partitionAndReplica.replica, TopicAndPartition(partitionAndReplica.topic, partitionAndReplica.partition))} } def replicasForPartition(partitions: collection.Set[TopicAndPartition]): collection.Set[PartitionAndReplica] = { @@ -150,6 +162,11 @@ class KafkaController(val 
config: KafkaConfig, zkUtils: ZkUtils, time: Time, met this.logIdent = "[Controller " + config.brokerId + "]: " private val stateChangeLogger = KafkaController.stateChangeLogger val controllerContext = new ControllerContext(zkUtils) + + // visible for testing + private[controller] val eventManager = new ControllerEventManager(controllerContext.stats.rateAndTimeMetrics, + _ => updateMetrics()) + val partitionStateMachine = new PartitionStateMachine(this) val replicaStateMachine = new ReplicaStateMachine(this) @@ -157,9 +174,6 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met // visible for testing private[controller] val kafkaScheduler = new KafkaScheduler(1) - // visible for testing - private[controller] val eventManager = new ControllerEventManager(controllerContext.stats.rateAndTimeMetrics, - _ => updateMetrics()) val topicDeletionManager = new TopicDeletionManager(this, eventManager) val offlinePartitionSelector = new OfflinePartitionLeaderSelector(controllerContext, config) @@ -175,6 +189,7 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met private val partitionReassignmentListener = new PartitionReassignmentListener(this, eventManager) private val preferredReplicaElectionListener = new PreferredReplicaElectionListener(this, eventManager) private val isrChangeNotificationListener = new IsrChangeNotificationListener(this, eventManager) + private val logDirEventNotificationListener = new LogDirEventNotificationListener(this, eventManager) @volatile private var activeControllerId = -1 @volatile private var offlinePartitionCount = 0 @@ -248,6 +263,7 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met info("Broker %d starting become controller state transition".format(config.brokerId)) readControllerEpochFromZookeeper() incrementControllerEpoch() + LogDirUtils.deleteLogDirEvents(zkUtils) // before reading source of truth from zookeeper, register the listeners to get 
broker/topic callbacks registerPartitionReassignmentListener() @@ -256,6 +272,7 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met registerTopicChangeListener() registerTopicDeletionListener() registerBrokerChangeListener() + registerLogDirEventNotificationListener() initializeControllerContext() val (topicsToBeDeleted, topicsIneligibleForDeletion) = fetchTopicDeletionsInProgress() @@ -299,6 +316,7 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met deregisterIsrChangeNotificationListener() deregisterPartitionReassignmentListener() deregisterPreferredReplicaElectionListener() + deregisterLogDirEventNotificationListener() // reset topic deletion manager topicDeletionManager.reset() @@ -329,6 +347,12 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met */ def isActive: Boolean = activeControllerId == config.brokerId + def onBrokerLogDirFailure(brokerIds: Seq[Int]) { + // send LeaderAndIsrRequest for all live replicas on those brokers to see if they are still online. + val replicasOnBrokers = controllerContext.replicasOnBrokers(brokerIds.toSet) + replicaStateMachine.handleStateChanges(replicasOnBrokers, OnlineReplica) + } + /** * This callback is invoked by the replica state machine's broker change listener, with the list of newly started * brokers as input. It does the following - @@ -345,6 +369,7 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met */ def onBrokerStartup(newBrokers: Seq[Int]) { info("New broker startup callback for %s".format(newBrokers.mkString(","))) + newBrokers.foreach(controllerContext.replicasOnOfflineDisks.remove) val newBrokersSet = newBrokers.toSet // send update metadata request to all live and shutting down brokers. Old brokers will get to know of the new // broker via this update. 
@@ -374,46 +399,52 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met } } - /** - * This callback is invoked by the replica state machine's broker change listener with the list of failed brokers - * as input. It does the following - - * 1. Mark partitions with dead leaders as offline - * 2. Triggers the OnlinePartition state change for all new/offline partitions - * 3. Invokes the OfflineReplica state change on the input list of newly started brokers - * 4. If no partitions are effected then send UpdateMetadataRequest to live or shutting down brokers - * - * Note that we don't need to refresh the leader/isr cache for all topic/partitions at this point. This is because - * the partition state machine will refresh our cache for us when performing leader election for all new/offline - * partitions coming online. - */ def onBrokerFailure(deadBrokers: Seq[Int]) { info("Broker failure callback for %s".format(deadBrokers.mkString(","))) + deadBrokers.foreach(controllerContext.replicasOnOfflineDisks.remove) val deadBrokersThatWereShuttingDown = deadBrokers.filter(id => controllerContext.shuttingDownBrokerIds.remove(id)) info("Removed %s from list of shutting down brokers.".format(deadBrokersThatWereShuttingDown)) - val deadBrokersSet = deadBrokers.toSet - // trigger OfflinePartition state for all partitions whose current leader is one amongst the dead brokers + val allReplicasOnDeadBrokers = controllerContext.replicasOnBrokers(deadBrokers.toSet) + onReplicaBecomeOffline(allReplicasOnDeadBrokers) + } + + /** + * This callback is invoked when replicas go offline, either because their broker failed or because the broker + * reported a storage error for them, with the set of newly offline replicas as input. It does the following - + * 1. Mark partitions with dead leaders as offline + * 2. Triggers the OnlinePartition state change for all new/offline partitions + * 3. Invokes the OfflineReplica state change on the newly offline replicas whose topics are not queued for deletion + * 4.
If no partitions are affected then send UpdateMetadataRequest to live or shutting down brokers + * + * Note that we don't need to refresh the leader/isr cache for all topic/partitions at this point. This is because + * the partition state machine will refresh our cache for us when performing leader election for all new/offline + * partitions coming online. + */ + def onReplicaBecomeOffline(newOfflineReplicas: Set[PartitionAndReplica]): Unit = { + val (newOfflineReplicasForDeletion, newOfflineReplicasNotForDeletion) = + newOfflineReplicas.partition(p => topicDeletionManager.isTopicQueuedUpForDeletion(p.topic)) + val partitionsWithoutLeader = controllerContext.partitionLeadershipInfo.filter(partitionAndLeader => - deadBrokersSet.contains(partitionAndLeader._2.leaderAndIsr.leader) && + !controllerContext.isReplicaOnline(partitionAndLeader._2.leaderAndIsr.leader, partitionAndLeader._1) && !topicDeletionManager.isTopicQueuedUpForDeletion(partitionAndLeader._1.topic)).keySet + + // trigger OfflinePartition state for all partitions whose current leader is one amongst the newOfflineReplicas partitionStateMachine.handleStateChanges(partitionsWithoutLeader, OfflinePartition) // trigger OnlinePartition state changes for offline or new partitions partitionStateMachine.triggerOnlinePartitionStateChange() - // filter out the replicas that belong to topics that are being deleted - var allReplicasOnDeadBrokers = controllerContext.replicasOnBrokers(deadBrokersSet) - val activeReplicasOnDeadBrokers = allReplicasOnDeadBrokers.filterNot(p => topicDeletionManager.isTopicQueuedUpForDeletion(p.topic)) - // handle dead replicas - replicaStateMachine.handleStateChanges(activeReplicasOnDeadBrokers, OfflineReplica) - // check if topic deletion state for the dead replicas needs to be updated - val replicasForTopicsToBeDeleted = allReplicasOnDeadBrokers.filter(p => topicDeletionManager.isTopicQueuedUpForDeletion(p.topic)) - if(replicasForTopicsToBeDeleted.nonEmpty) { + // trigger
OfflineReplica state change for those newly-discovered offline replicas + replicaStateMachine.handleStateChanges(newOfflineReplicasNotForDeletion, OfflineReplica) + + // fail deletion of topics that are affected by the offline replicas + if (newOfflineReplicasForDeletion.nonEmpty) { // it is required to mark the respective replicas in TopicDeletionFailed state since the replica cannot be - // deleted when the broker is down. This will prevent the replica from being in TopicDeletionStarted state indefinitely + // deleted when its disk is down. This will prevent the replica from being in TopicDeletionStarted state indefinitely // since topic deletion cannot be retried until at least one replica is in TopicDeletionStarted state - topicDeletionManager.failReplicaDeletion(replicasForTopicsToBeDeleted) + topicDeletionManager.failReplicaDeletion(newOfflineReplicasForDeletion) } - // If broker failure did not require leader re-election, inform brokers of failed broker + // If replica failure did not require leader re-election, inform brokers of the offline replica // Note that during leader re-election, brokers update their metadata if (partitionsWithoutLeader.isEmpty) { sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq) @@ -724,10 +755,11 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met private def fetchTopicDeletionsInProgress(): (Set[String], Set[String]) = { val topicsToBeDeleted = zkUtils.getChildrenParentMayNotExist(ZkUtils.DeleteTopicsPath).toSet - val topicsWithReplicasOnDeadBrokers = controllerContext.partitionReplicaAssignment.filter { case (_, replicas) => - replicas.exists(r => !controllerContext.liveBrokerIds.contains(r)) }.keySet.map(_.topic) + val topicsWithOfflineReplicas = controllerContext.partitionReplicaAssignment.filter { case (partition, replicas) => + replicas.exists(r => !controllerContext.isReplicaOnline(r, partition)) + }.keySet.map(_.topic) val
topicsForWhichPartitionReassignmentIsInProgress = controllerContext.partitionsBeingReassigned.keySet.map(_.topic) - val topicsIneligibleForDeletion = topicsWithReplicasOnDeadBrokers | topicsForWhichPartitionReassignmentIsInProgress + val topicsIneligibleForDeletion = topicsWithOfflineReplicas | topicsForWhichPartitionReassignmentIsInProgress info("List of topics to be deleted: %s".format(topicsToBeDeleted.mkString(","))) info("List of topics ineligible for deletion: %s".format(topicsIneligibleForDeletion.mkString(","))) (topicsToBeDeleted, topicsIneligibleForDeletion) @@ -771,7 +803,7 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met partitionStateMachine.handleStateChanges(Set(topicAndPartition), OnlinePartition, reassignedPartitionLeaderSelector) } else { // check if the leader is alive or not - if (controllerContext.liveBrokerIds.contains(currentLeader)) { + if (controllerContext.isReplicaOnline(currentLeader, topicAndPartition)) { info("Leader %s for partition %s being reassigned, ".format(currentLeader, topicAndPartition) + "is already in the new list of replicas %s and is alive".format(reassignedReplicas.mkString(","))) // shrink replication factor and update the leader epoch in zookeeper to use on the next LeaderAndIsrRequest @@ -909,6 +941,16 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met } } + private def registerLogDirEventNotificationListener() = { + debug("Registering logDirEventNotificationListener") + zkUtils.zkClient.subscribeChildChanges(ZkUtils.LogDirEventNotificationPath, logDirEventNotificationListener) + } + + private def deregisterLogDirEventNotificationListener() = { + debug("De-registering logDirEventNotificationListener") + zkUtils.zkClient.unsubscribeChildChanges(ZkUtils.LogDirEventNotificationPath, logDirEventNotificationListener) + } + private def readControllerEpochFromZookeeper() { // initialize the controller epoch and zk version by reading from zookeeper 
if(controllerContext.zkUtils.pathExists(ZkUtils.ControllerEpochPath)) { @@ -1119,7 +1161,7 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met topicsNotInPreferredReplica.keys.foreach { topicPartition => // do this check only if the broker is live and there are no partitions being reassigned currently // and preferred replica election is not in progress - if (controllerContext.liveBrokerIds.contains(leaderBroker) && + if (controllerContext.isReplicaOnline(leaderBroker, topicPartition) && controllerContext.partitionsBeingReassigned.isEmpty && !topicDeletionManager.isTopicQueuedUpForDeletion(topicPartition.topic) && controllerContext.allTopics.contains(topicPartition.topic)) { @@ -1352,6 +1394,22 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met } + case class LogDirEventNotification(sequenceNumbers: Seq[String]) extends ControllerEvent { + + def state = ControllerState.LogDirChange + + override def process(): Unit = { + val zkUtils = controllerContext.zkUtils + try { + val brokerIds = sequenceNumbers.flatMap(LogDirUtils.getBrokerIdFromLogDirEvent(zkUtils, _)) + onBrokerLogDirFailure(brokerIds) + } finally { + // delete processed children + sequenceNumbers.map(x => zkUtils.deletePath(ZkUtils.LogDirEventNotificationPath + "/" + x)) + } + } + } + case class PreferredReplicaLeaderElection(partitions: Set[TopicAndPartition]) extends ControllerEvent { def state = ControllerState.ManualLeaderBalance @@ -1447,7 +1505,35 @@ class KafkaController(val config: KafkaConfig, zkUtils: ZkUtils, time: Time, met } } - case class TopicDeletionStopReplicaResult(stopReplicaResponseObj: AbstractResponse, replicaId: Int) extends ControllerEvent { + case class LeaderAndIsrResponseReceived(LeaderAndIsrResponseObj: AbstractResponse, brokerId: Int) extends ControllerEvent { + + def state = ControllerState.LeaderAndIsrResponseReceived + + override def process(): Unit = { + import JavaConverters._ + val leaderAndIsrResponse = 
LeaderAndIsrResponseObj.asInstanceOf[LeaderAndIsrResponse] + + if (leaderAndIsrResponse.error() != Errors.NONE) { + stateChangeLogger.error(s"Received error in leaderAndIsrResponse $leaderAndIsrResponse from broker $brokerId") + return + } + + val offlineReplicas = leaderAndIsrResponse.responses().asScala.filter(_._2 == Errors.KAFKA_STORAGE_ERROR).keys.map( + tp => TopicAndPartition(tp.topic(), tp.partition())).toSet + val onlineReplicas = leaderAndIsrResponse.responses().asScala.filter(_._2 == Errors.NONE).keys.map( + tp => TopicAndPartition(tp.topic(), tp.partition())).toSet + val previousOfflineReplicas = controllerContext.replicasOnOfflineDisks.getOrElse(brokerId, Set.empty[TopicAndPartition]) + val currentOfflineReplicas = previousOfflineReplicas -- onlineReplicas ++ offlineReplicas + controllerContext.replicasOnOfflineDisks.put(brokerId, currentOfflineReplicas) + val newOfflineReplicas = (currentOfflineReplicas -- previousOfflineReplicas).map(tp => PartitionAndReplica(tp.topic, tp.partition, brokerId)) + stateChangeLogger.info(s"Mark replicas ${currentOfflineReplicas -- previousOfflineReplicas} on broker $brokerId as offline") + + if (newOfflineReplicas.nonEmpty) + onReplicaBecomeOffline(newOfflineReplicas) + } + } + + case class TopicDeletionStopReplicaResponseReceived(stopReplicaResponseObj: AbstractResponse, replicaId: Int) extends ControllerEvent { def state = ControllerState.TopicDeletion @@ -1607,6 +1693,20 @@ class TopicChangeListener(controller: KafkaController, eventManager: ControllerE } } +/** + * Called when broker notifies controller of disk change + */ +class LogDirEventNotificationListener(controller: KafkaController, eventManager: ControllerEventManager) extends IZkChildListener with Logging { + override def handleChildChange(parentPath: String, currentChilds: java.util.List[String]): Unit = { + import JavaConverters._ + eventManager.put(controller.LogDirEventNotification(currentChilds.asScala)) + } +} + +object LogDirEventNotificationListener 
{ + val version: Long = 1L +} + class PartitionModificationsListener(controller: KafkaController, eventManager: ControllerEventManager, topic: String) extends IZkDataListener with Logging { override def handleDataChange(dataPath: String, data: Any): Unit = { eventManager.put(controller.PartitionModifications(topic)) diff --git a/core/src/main/scala/kafka/controller/PartitionLeaderSelector.scala b/core/src/main/scala/kafka/controller/PartitionLeaderSelector.scala index 54bbb8938f9d8..a66a9933b70e0 100644 --- a/core/src/main/scala/kafka/controller/PartitionLeaderSelector.scala +++ b/core/src/main/scala/kafka/controller/PartitionLeaderSelector.scala @@ -54,8 +54,8 @@ class OfflinePartitionLeaderSelector(controllerContext: ControllerContext, confi def selectLeader(topicAndPartition: TopicAndPartition, currentLeaderAndIsr: LeaderAndIsr): (LeaderAndIsr, Seq[Int]) = { controllerContext.partitionReplicaAssignment.get(topicAndPartition) match { case Some(assignedReplicas) => - val liveAssignedReplicas = assignedReplicas.filter(r => controllerContext.liveBrokerIds.contains(r)) - val liveBrokersInIsr = currentLeaderAndIsr.isr.filter(r => controllerContext.liveBrokerIds.contains(r)) + val liveAssignedReplicas = assignedReplicas.filter(r => controllerContext.isReplicaOnline(r, topicAndPartition)) + val liveBrokersInIsr = currentLeaderAndIsr.isr.filter(r => controllerContext.isReplicaOnline(r, topicAndPartition)) val newLeaderAndIsr = if (liveBrokersInIsr.isEmpty) { // Prior to electing an unclean (i.e. 
non-ISR) leader, ensure that doing so is not disallowed by the configuration @@ -111,7 +111,7 @@ class ReassignedPartitionLeaderSelector(controllerContext: ControllerContext) ex currentLeaderAndIsr: LeaderAndIsr): (LeaderAndIsr, Seq[Int]) = { val reassignedInSyncReplicas = controllerContext.partitionsBeingReassigned(topicAndPartition).newReplicas val newLeaderOpt = reassignedInSyncReplicas.find { r => - controllerContext.liveBrokerIds.contains(r) && currentLeaderAndIsr.isr.contains(r) + controllerContext.isReplicaOnline(r, topicAndPartition) && currentLeaderAndIsr.isr.contains(r) } newLeaderOpt match { case Some(newLeader) => (currentLeaderAndIsr.newLeader(newLeader), reassignedInSyncReplicas) @@ -174,8 +174,7 @@ class ControlledShutdownLeaderSelector(controllerContext: ControllerContext) ext currentLeaderAndIsr: LeaderAndIsr): (LeaderAndIsr, Seq[Int]) = { val currentIsr = currentLeaderAndIsr.isr val assignedReplicas = controllerContext.partitionReplicaAssignment(topicAndPartition) - val liveOrShuttingDownBrokerIds = controllerContext.liveOrShuttingDownBrokerIds - val liveAssignedReplicas = assignedReplicas.filter(r => liveOrShuttingDownBrokerIds.contains(r)) + val liveAssignedReplicas = assignedReplicas.filter(r => controllerContext.isReplicaOnline(r, topicAndPartition, true)) val newIsr = currentIsr.filter(brokerId => !controllerContext.shuttingDownBrokerIds.contains(brokerId)) liveAssignedReplicas.find(newIsr.contains) match { diff --git a/core/src/main/scala/kafka/controller/PartitionStateMachine.scala b/core/src/main/scala/kafka/controller/PartitionStateMachine.scala index 5751e17de0c05..d74163274b796 100755 --- a/core/src/main/scala/kafka/controller/PartitionStateMachine.scala +++ b/core/src/main/scala/kafka/controller/PartitionStateMachine.scala @@ -200,7 +200,7 @@ class PartitionStateMachine(controller: KafkaController) extends Logging { controllerContext.partitionLeadershipInfo.get(topicPartition) match { case Some(currentLeaderIsrAndEpoch) => // else, 
check if the leader for partition is alive. If yes, it is in Online state, else it is in Offline state - if (controllerContext.liveBrokerIds.contains(currentLeaderIsrAndEpoch.leaderAndIsr.leader)) + if (controllerContext.isReplicaOnline(currentLeaderIsrAndEpoch.leaderAndIsr.leader, topicPartition)) // leader is alive partitionState.put(topicPartition, OnlinePartition) else @@ -227,7 +227,7 @@ class PartitionStateMachine(controller: KafkaController) extends Logging { */ private def initializeLeaderAndIsrForPartition(topicAndPartition: TopicAndPartition) = { val replicaAssignment = controllerContext.partitionReplicaAssignment(topicAndPartition).toList - val liveAssignedReplicas = replicaAssignment.filter(controllerContext.liveBrokerIds.contains) + val liveAssignedReplicas = replicaAssignment.filter(r => controllerContext.isReplicaOnline(r, topicAndPartition)) liveAssignedReplicas.headOption match { case None => val failMsg = s"Controller $controllerId epoch ${controller.epoch} encountered error during state change of " + @@ -259,7 +259,9 @@ class PartitionStateMachine(controller: KafkaController) extends Logging { topicAndPartition.topic, topicAndPartition.partition, leaderIsrAndControllerEpoch, - replicaAssignment + replicaAssignment, + callback = null, + isNew = true ) } catch { case _: ZkNodeExistsException => diff --git a/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala b/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala index 43fac19e06248..a1bd5859943fe 100755 --- a/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala +++ b/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala @@ -149,7 +149,7 @@ class ReplicaStateMachine(controller: KafkaController) extends Logging { .format(replicaId, topicAndPartition) + "state as it is being requested to become leader") brokerRequestBatch.addLeaderAndIsrRequestForBrokers(List(replicaId), topic, partition, leaderIsrAndControllerEpoch, - replicaAssignment) + replicaAssignment, 
callback = null, isNew = true) case None => // new leader request will be sent to this replica when one gets elected } replicaState.put(partitionAndReplica, NewReplica) @@ -283,7 +283,7 @@ class ReplicaStateMachine(controller: KafkaController) extends Logging { val partition = topicPartition.partition assignedReplicas.foreach { replicaId => val partitionAndReplica = PartitionAndReplica(topic, partition, replicaId) - if (controllerContext.liveBrokerIds.contains(replicaId)) + if (controllerContext.isReplicaOnline(replicaId, topicPartition)) replicaState.put(partitionAndReplica, OnlineReplica) else // mark replicas on dead brokers as failed for topic deletion, if they belong to a topic to be deleted. diff --git a/core/src/main/scala/kafka/controller/TopicDeletionManager.scala b/core/src/main/scala/kafka/controller/TopicDeletionManager.scala index e483ac2331769..325488eb20228 100755 --- a/core/src/main/scala/kafka/controller/TopicDeletionManager.scala +++ b/core/src/main/scala/kafka/controller/TopicDeletionManager.scala @@ -299,7 +299,7 @@ class TopicDeletionManager(controller: KafkaController, eventManager: Controller debug("Deletion started for replicas %s".format(replicasForDeletionRetry.mkString(","))) controller.replicaStateMachine.handleStateChanges(replicasForDeletionRetry, ReplicaDeletionStarted, new Callbacks.CallbackBuilder().stopReplicaCallback((stopReplicaResponseObj, replicaId) => - eventManager.put(controller.TopicDeletionStopReplicaResult(stopReplicaResponseObj, replicaId))).build) + eventManager.put(controller.TopicDeletionStopReplicaResponseReceived(stopReplicaResponseObj, replicaId))).build) if (deadReplicasForTopic.nonEmpty) { debug("Dead Replicas (%s) found for topic %s".format(deadReplicasForTopic.mkString(","), topic)) markTopicIneligibleForDeletion(Set(topic)) diff --git a/core/src/main/scala/kafka/coordinator/group/GroupMetadataManager.scala b/core/src/main/scala/kafka/coordinator/group/GroupMetadataManager.scala index 
9322ff26d6ea1..774860f1c6634 100644 --- a/core/src/main/scala/kafka/coordinator/group/GroupMetadataManager.scala +++ b/core/src/main/scala/kafka/coordinator/group/GroupMetadataManager.scala @@ -17,7 +17,7 @@ package kafka.coordinator.group -import java.io.PrintStream +import java.io.{IOException, PrintStream} import java.nio.ByteBuffer import java.nio.charset.StandardCharsets import java.util.concurrent.TimeUnit @@ -455,6 +455,9 @@ class GroupMetadataManager(brokerId: Int, loadGroupsAndOffsets(topicPartition, onGroupLoaded) info(s"Finished loading offsets and group metadata from $topicPartition in ${time.milliseconds() - startMs} milliseconds.") } catch { + case e: IOException => + replicaManager.getLogDir(topicPartition).foreach(replicaManager.maybeAddLogFailureEvent) + error(s"Error loading offsets from $topicPartition", e) case t: Throwable => error(s"Error loading offsets from $topicPartition", t) } finally { inLock(partitionLock) { diff --git a/core/src/main/scala/kafka/coordinator/transaction/TransactionCoordinator.scala b/core/src/main/scala/kafka/coordinator/transaction/TransactionCoordinator.scala index 85c19c53da5aa..e201e9145f15d 100644 --- a/core/src/main/scala/kafka/coordinator/transaction/TransactionCoordinator.scala +++ b/core/src/main/scala/kafka/coordinator/transaction/TransactionCoordinator.scala @@ -266,7 +266,7 @@ class TransactionCoordinator(brokerId: Int, } def handleTxnImmigration(txnTopicPartitionId: Int, coordinatorEpoch: Int) { - txnManager.loadTransactionsForTxnTopicPartition(txnTopicPartitionId, coordinatorEpoch, txnMarkerChannelManager.addTxnMarkersToSend) + txnManager.loadTransactionsForTxnTopicPartition(txnTopicPartitionId, coordinatorEpoch, txnMarkerChannelManager.addTxnMarkersToSend) } def handleTxnEmigration(txnTopicPartitionId: Int, coordinatorEpoch: Int) { diff --git a/core/src/main/scala/kafka/log/AbstractIndex.scala b/core/src/main/scala/kafka/log/AbstractIndex.scala index bfc682855652d..d569ad9def992 100644 --- 
a/core/src/main/scala/kafka/log/AbstractIndex.scala +++ b/core/src/main/scala/kafka/log/AbstractIndex.scala @@ -178,6 +178,12 @@ abstract class AbstractIndex[K, V](@volatile var file: File, val baseOffset: Lon trimToValidSize() } + def closeHandler() = { + // File handler of the index field will be closed after the mmap is garbage collected + CoreUtils.swallow(forceUnmap(mmap)) + mmap = null + } + /** * Do a basic sanity check on this index to detect obvious problems * diff --git a/core/src/main/scala/kafka/log/Log.scala b/core/src/main/scala/kafka/log/Log.scala index 824d302b5b2ac..f5db9dcb43320 100644 --- a/core/src/main/scala/kafka/log/Log.scala +++ b/core/src/main/scala/kafka/log/Log.scala @@ -531,6 +531,16 @@ class Log(@volatile var dir: File, } } + /** + * Close file handlers used by log but don't write to disk. This is used when the disk may have failed + */ + def closeHandlers() { + debug(s"Closing handlers of log $name") + lock synchronized { + logSegments.foreach(_.closeHandlers()) + } + } + /** * Append this message set to the active segment of the log, assigning offsets and Partition Leader Epochs * @param records The records to append diff --git a/core/src/main/scala/kafka/log/LogCleaner.scala b/core/src/main/scala/kafka/log/LogCleaner.scala index 4898d1129cfa7..834683b2e89ca 100644 --- a/core/src/main/scala/kafka/log/LogCleaner.scala +++ b/core/src/main/scala/kafka/log/LogCleaner.scala @@ -5,7 +5,7 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -17,7 +17,7 @@ package kafka.log -import java.io.File +import java.io.{File, IOException} import java.nio._ import java.util.Date import java.util.concurrent.{CountDownLatch, TimeUnit} @@ -25,6 +25,7 @@ import java.util.concurrent.{CountDownLatch, TimeUnit} import com.yammer.metrics.core.Gauge import kafka.common._ import kafka.metrics.KafkaMetricsGroup +import kafka.server.LogDirFailureChannel import kafka.utils._ import org.apache.kafka.common.record._ import org.apache.kafka.common.utils.Time @@ -38,29 +39,29 @@ import scala.collection.JavaConverters._ /** * The cleaner is responsible for removing obsolete records from logs which have the "compact" retention strategy. * A message with key K and offset O is obsolete if there exists a message with key K and offset O' such that O < O'. - * + * * Each log can be thought of being split into two sections of segments: a "clean" section which has previously been cleaned followed by a * "dirty" section that has not yet been cleaned. The dirty section is further divided into the "cleanable" section followed by an "uncleanable" section. * The uncleanable section is excluded from cleaning. The active log segment is always uncleanable. If there is a * compaction lag time set, segments whose largest message timestamp is within the compaction lag time of the cleaning operation are also uncleanable. * * The cleaning is carried out by a pool of background threads. Each thread chooses the dirtiest log that has the "compact" retention policy - * and cleans that. The dirtiness of the log is guessed by taking the ratio of bytes in the dirty section of the log to the total bytes in the log. - * + * and cleans that. The dirtiness of the log is guessed by taking the ratio of bytes in the dirty section of the log to the total bytes in the log. 
+ * * To clean a log the cleaner first builds a mapping of key=>last_offset for the dirty section of the log. See kafka.log.OffsetMap for details of - * the implementation of the mapping. - * - * Once the key=>offset map is built, the log is cleaned by recopying each log segment but omitting any key that appears in the offset map with a + * the implementation of the mapping. + * + * Once the key=>offset map is built, the log is cleaned by recopying each log segment but omitting any key that appears in the offset map with a * higher offset than what is found in the segment (i.e. messages with a key that appears in the dirty section of the log). - * + * * To avoid segments shrinking to very small sizes with repeated cleanings we implement a rule by which if we will merge successive segments when * doing a cleaning if their log and index size are less than the maximum log and index size prior to the clean beginning. - * + * * Cleaned segments are swapped into the log as they become available. - * + * * One nuance that the cleaner must handle is log truncation. If a log is truncated while it is being cleaned the cleaning of that log is aborted. - * - * Messages with null payload are treated as deletes for the purpose of log compaction. This means that they receive special treatment by the cleaner. + * + * Messages with null payload are treated as deletes for the purpose of log compaction. This means that they receive special treatment by the cleaner. * The cleaner will only retain delete records for a period of time to avoid accumulating space indefinitely. This period of time is configurable on a per-topic * basis and is measured from the time the segment enters the clean portion of the log (at which point any prior message with that key has been removed). * Delete markers in the clean section of the log that are older than this time will not be retained when log segments are being recopied as part of cleaning. 
@@ -86,31 +87,32 @@ import scala.collection.JavaConverters._ * @param time A way to control the passage of time */ class LogCleaner(val config: CleanerConfig, - val logDirs: Array[File], + val logDirs: Seq[File], val logs: Pool[TopicPartition, Log], + val logDirFailureChannel: LogDirFailureChannel, time: Time = Time.SYSTEM) extends Logging with KafkaMetricsGroup { - + /* for managing the state of partitions being cleaned. package-private to allow access in tests */ - private[log] val cleanerManager = new LogCleanerManager(logDirs, logs) + private[log] val cleanerManager = new LogCleanerManager(logDirs, logs, logDirFailureChannel) /* a throttle used to limit the I/O of all the cleaner threads to a user-specified maximum rate */ - private val throttler = new Throttler(desiredRatePerSec = config.maxIoBytesPerSecond, - checkIntervalMs = 300, - throttleDown = true, + private val throttler = new Throttler(desiredRatePerSec = config.maxIoBytesPerSecond, + checkIntervalMs = 300, + throttleDown = true, "cleaner-io", "bytes", time = time) - + /* the threads */ private val cleaners = (0 until config.numThreads).map(new CleanerThread(_)) - + /* a metric to track the maximum utilization of any thread's buffer in the last cleaning */ - newGauge("max-buffer-utilization-percent", + newGauge("max-buffer-utilization-percent", new Gauge[Int] { def value: Int = cleaners.map(_.lastStats).map(100 * _.bufferUtilization).max.toInt }) /* a metric to track the recopy rate of each thread's last cleaning */ - newGauge("cleaner-recopy-percent", + newGauge("cleaner-recopy-percent", new Gauge[Int] { def value: Int = { val stats = cleaners.map(_.lastStats) @@ -123,7 +125,7 @@ class LogCleaner(val config: CleanerConfig, new Gauge[Int] { def value: Int = cleaners.map(_.lastStats).map(_.elapsedSecs).max.toInt }) - + /** * Start the background cleaning */ @@ -131,7 +133,7 @@ class LogCleaner(val config: CleanerConfig, info("Starting the log cleaner") cleaners.foreach(_.start()) } - + /** * Stop the 
background cleaning */ @@ -139,7 +141,7 @@ class LogCleaner(val config: CleanerConfig, info("Shutting down the log cleaner.") cleaners.foreach(_.shutdown()) } - + /** * Abort the cleaning of a particular partition, if it's in progress. This call blocks until the cleaning of * the partition is aborted. @@ -155,6 +157,10 @@ class LogCleaner(val config: CleanerConfig, cleanerManager.updateCheckpoints(dataDir, update=None) } + def handleLogDirFailure(dir: String) { + cleanerManager.handleLogDirFailure(dir) + } + /** * Truncate cleaner offset checkpoint for the given partition if its checkpointed offset is larger than the given offset */ @@ -197,21 +203,21 @@ class LogCleaner(val config: CleanerConfig, } isCleaned } - + /** * The cleaner threads do the actual log cleaning. Each thread processes does its cleaning repeatedly by * choosing the dirtiest log, cleaning it, and then swapping in the cleaned segments. */ private class CleanerThread(threadId: Int) extends ShutdownableThread(name = "kafka-log-cleaner-thread-" + threadId, isInterruptible = false) { - + override val loggerName = classOf[LogCleaner].getName - + if(config.dedupeBufferSize / config.numThreads > Int.MaxValue) warn("Cannot use more than 2G of cleaner buffer space per cleaner thread, ignoring excess buffer space...") val cleaner = new Cleaner(id = threadId, - offsetMap = new SkimpyOffsetMap(memory = math.min(config.dedupeBufferSize / config.numThreads, Int.MaxValue).toInt, + offsetMap = new SkimpyOffsetMap(memory = math.min(config.dedupeBufferSize / config.numThreads, Int.MaxValue).toInt, hashAlgorithm = config.hashAlgorithm), ioBufferSize = config.ioBufferSize / config.numThreads / 2, maxIoBufferSize = config.maxMessageSize, @@ -219,7 +225,7 @@ class LogCleaner(val config: CleanerConfig, throttler = throttler, time = time, checkDone = checkDone) - + @volatile var lastStats: CleanerStats = new CleanerStats() private val backOffWaitLatch = new CountDownLatch(1) @@ -241,7 +247,7 @@ class LogCleaner(val 
config: CleanerConfig, backOffWaitLatch.countDown() awaitShutdown() } - + /** * Clean a log if there is a dirty log available, otherwise sleep for a bit */ @@ -258,6 +264,9 @@ class LogCleaner(val config: CleanerConfig, endOffset = nextDirtyOffset } catch { case _: LogCleaningAbortedException => // task can be aborted, let it go. + case e: IOException => + error(s"Failed to cleanup log for ${cleanable.topicPartition} due to IOException", e) + logDirFailureChannel.maybeAddLogFailureEvent(cleanable.log.dir.getParent) } finally { cleanerManager.doneCleaning(cleanable.topicPartition, cleanable.log.dir.getParentFile, endOffset) } @@ -268,6 +277,10 @@ class LogCleaner(val config: CleanerConfig, case (topicPartition, log) => try { log.deleteOldSegments() + } catch { + case e: IOException => + error(s"Failed to delete old segments for $topicPartition due to IOException", e) + logDirFailureChannel.maybeAddLogFailureEvent(log.dir.getParent) } finally { cleanerManager.doneDeleting(topicPartition) } @@ -275,36 +288,36 @@ class LogCleaner(val config: CleanerConfig, if (!cleaned) backOffWaitLatch.await(config.backOffMs, TimeUnit.MILLISECONDS) } - + /** * Log out statistics on a single run of the cleaner. 
*/ def recordStats(id: Int, name: String, from: Long, to: Long, stats: CleanerStats) { this.lastStats = stats def mb(bytes: Double) = bytes / (1024*1024) - val message = - "%n\tLog cleaner thread %d cleaned log %s (dirty section = [%d, %d])%n".format(id, name, from, to) + - "\t%,.1f MB of log processed in %,.1f seconds (%,.1f MB/sec).%n".format(mb(stats.bytesRead), - stats.elapsedSecs, - mb(stats.bytesRead/stats.elapsedSecs)) + - "\tIndexed %,.1f MB in %.1f seconds (%,.1f Mb/sec, %.1f%% of total time)%n".format(mb(stats.mapBytesRead), - stats.elapsedIndexSecs, - mb(stats.mapBytesRead)/stats.elapsedIndexSecs, + val message = + "%n\tLog cleaner thread %d cleaned log %s (dirty section = [%d, %d])%n".format(id, name, from, to) + + "\t%,.1f MB of log processed in %,.1f seconds (%,.1f MB/sec).%n".format(mb(stats.bytesRead), + stats.elapsedSecs, + mb(stats.bytesRead/stats.elapsedSecs)) + + "\tIndexed %,.1f MB in %.1f seconds (%,.1f Mb/sec, %.1f%% of total time)%n".format(mb(stats.mapBytesRead), + stats.elapsedIndexSecs, + mb(stats.mapBytesRead)/stats.elapsedIndexSecs, 100 * stats.elapsedIndexSecs/stats.elapsedSecs) + "\tBuffer utilization: %.1f%%%n".format(100 * stats.bufferUtilization) + - "\tCleaned %,.1f MB in %.1f seconds (%,.1f Mb/sec, %.1f%% of total time)%n".format(mb(stats.bytesRead), - stats.elapsedSecs - stats.elapsedIndexSecs, - mb(stats.bytesRead)/(stats.elapsedSecs - stats.elapsedIndexSecs), 100 * (stats.elapsedSecs - stats.elapsedIndexSecs).toDouble/stats.elapsedSecs) + + "\tCleaned %,.1f MB in %.1f seconds (%,.1f Mb/sec, %.1f%% of total time)%n".format(mb(stats.bytesRead), + stats.elapsedSecs - stats.elapsedIndexSecs, + mb(stats.bytesRead)/(stats.elapsedSecs - stats.elapsedIndexSecs), 100 * (stats.elapsedSecs - stats.elapsedIndexSecs).toDouble/stats.elapsedSecs) + "\tStart size: %,.1f MB (%,d messages)%n".format(mb(stats.bytesRead), stats.messagesRead) + - "\tEnd size: %,.1f MB (%,d messages)%n".format(mb(stats.bytesWritten), stats.messagesWritten) + - 
"\t%.1f%% size reduction (%.1f%% fewer messages)%n".format(100.0 * (1.0 - stats.bytesWritten.toDouble/stats.bytesRead), + "\tEnd size: %,.1f MB (%,d messages)%n".format(mb(stats.bytesWritten), stats.messagesWritten) + + "\t%.1f%% size reduction (%.1f%% fewer messages)%n".format(100.0 * (1.0 - stats.bytesWritten.toDouble/stats.bytesRead), 100.0 * (1.0 - stats.messagesWritten.toDouble/stats.messagesRead)) info(message) if (stats.invalidMessagesRead > 0) { warn("\tFound %d invalid messages during compaction.".format(stats.invalidMessagesRead)) } } - + } } @@ -327,14 +340,14 @@ private[log] class Cleaner(val id: Int, throttler: Throttler, time: Time, checkDone: (TopicPartition) => Unit) extends Logging { - + override val loggerName = classOf[LogCleaner].getName this.logIdent = "Cleaner " + id + ": " /* buffer used for read i/o */ private var readBuffer = ByteBuffer.allocate(ioBufferSize) - + /* buffer used for write i/o */ private var writeBuffer = ByteBuffer.allocate(ioBufferSize) @@ -352,7 +365,7 @@ private[log] class Cleaner(val id: Int, private[log] def clean(cleanable: LogToClean): (Long, CleanerStats) = { // figure out the timestamp below which it is safe to remove delete tombstones // this position is defined to be a configurable time beneath the last modified time of the last clean segment - val deleteHorizonMs = + val deleteHorizonMs = cleanable.log.logSegments(0, cleanable.firstDirtyOffset).lastOption match { case None => 0L case Some(seg) => seg.lastModified - cleanable.log.config.deleteRetentionMs diff --git a/core/src/main/scala/kafka/log/LogCleanerManager.scala b/core/src/main/scala/kafka/log/LogCleanerManager.scala index 4a4a59f993b38..ff2c4def0d312 100755 --- a/core/src/main/scala/kafka/log/LogCleanerManager.scala +++ b/core/src/main/scala/kafka/log/LogCleanerManager.scala @@ -17,13 +17,14 @@ package kafka.log -import java.io.File +import java.io.{File, IOException} import java.util.concurrent.TimeUnit import java.util.concurrent.locks.ReentrantLock 
import com.yammer.metrics.core.Gauge import kafka.common.LogCleaningAbortedException import kafka.metrics.KafkaMetricsGroup +import kafka.server.LogDirFailureChannel import kafka.server.checkpoints.OffsetCheckpointFile import kafka.utils.CoreUtils._ import kafka.utils.{Logging, Pool} @@ -45,7 +46,9 @@ private[log] case object LogCleaningPaused extends LogCleaningState * While a partition is in the LogCleaningPaused state, it won't be scheduled for cleaning again, until cleaning is * requested to be resumed. */ -private[log] class LogCleanerManager(val logDirs: Array[File], val logs: Pool[TopicPartition, Log]) extends Logging with KafkaMetricsGroup { +private[log] class LogCleanerManager(val logDirs: Seq[File], + val logs: Pool[TopicPartition, Log], + val logDirFailureChannel: LogDirFailureChannel) extends Logging with KafkaMetricsGroup { import LogCleanerManager._ @@ -53,19 +56,19 @@ private[log] class LogCleanerManager(val logDirs: Array[File], val logs: Pool[To // package-private for testing private[log] val offsetCheckpointFile = "cleaner-offset-checkpoint" - + /* the offset checkpoints holding the last cleaned point for each log */ - private val checkpoints = logDirs.map(dir => (dir, new OffsetCheckpointFile(new File(dir, offsetCheckpointFile)))).toMap + @volatile private var checkpoints = logDirs.map(dir => (dir, new OffsetCheckpointFile(new File(dir, offsetCheckpointFile)))).toMap /* the set of logs currently being cleaned */ private val inProgress = mutable.HashMap[TopicPartition, LogCleaningState]() /* a global lock used to control all access to the in-progress set and the offset checkpoints */ private val lock = new ReentrantLock - + /* for coordinating the pausing and the cleaning of a partition */ private val pausedCleaningCond = lock.newCondition() - + /* a gauge for tracking the cleanable ratio of the dirtiest log */ @volatile private var dirtiestLogCleanableRatio = 0.0 newGauge("max-dirty-percent", new Gauge[Int] { def value = (100 * 
dirtiestLogCleanableRatio).toInt }) @@ -77,8 +80,21 @@ private[log] class LogCleanerManager(val logDirs: Array[File], val logs: Pool[To /** * @return the position processed for all logs. */ - def allCleanerCheckpoints: Map[TopicPartition, Long] = - checkpoints.values.flatMap(_.read()).toMap + def allCleanerCheckpoints: Map[TopicPartition, Long] = { + inLock(lock) { + checkpoints.values.flatMap(checkpoint => { + try { + checkpoint.read() + } catch { + case e: IOException => + error(s"Failed to access checkpoint file ${checkpoint.f}", e) + logDirFailureChannel.maybeAddLogFailureEvent(checkpoint.f.getParentFile.getAbsolutePath) + Map.empty[TopicPartition, Long] + } + }).toMap + } + } + /** * Choose the log to clean next and add it to the in-progress set. We recompute this @@ -217,8 +233,23 @@ private[log] class LogCleanerManager(val logDirs: Array[File], val logs: Pool[To def updateCheckpoints(dataDir: File, update: Option[(TopicPartition,Long)]) { inLock(lock) { - val checkpoint = checkpoints(dataDir) - val existing = checkpoint.read().filterKeys(logs.keys) ++ update - checkpoint.write(existing) + val checkpoint = checkpoints.getOrElse(dataDir, null) + if (checkpoint != null) { + try { + val existing = checkpoint.read().filterKeys(logs.keys) ++ update + checkpoint.write(existing) + } catch { + case e: IOException => + error(s"Failed to access checkpoint file ${checkpoint.f}", e) + logDirFailureChannel.maybeAddLogFailureEvent(checkpoint.f.getParentFile.getAbsolutePath) + } + } + } + } + + def handleLogDirFailure(dir: String) { + info(s"Stopping cleaning logs in dir $dir") + inLock(lock) { + checkpoints = checkpoints.filterKeys(_.getAbsolutePath != dir) } } @@ -226,10 +257,18 @@ private[log] class LogCleanerManager(val logDirs: Array[File], val logs: Pool[To inLock(lock) { if (logs.get(topicPartition).config.compact) { - val checkpoint = checkpoints(dataDir) - val existing = checkpoint.read() - if (existing.getOrElse(topicPartition, 0L) > offset) - checkpoint.write(existing + (topicPartition -> offset)) + val checkpoint = checkpoints.getOrElse(dataDir, null) + if (checkpoint != 
null) { + try { + val existing = checkpoint.read() + if (existing.getOrElse(topicPartition, 0L) > offset) + checkpoint.write(existing + (topicPartition -> offset)) + } catch { + case e: IOException => + error(s"Failed to access checkpoint file ${checkpoint.f}", e) + logDirFailureChannel.maybeAddLogFailureEvent(checkpoint.f.getParentFile.getAbsolutePath) + } + } } } } @@ -241,7 +280,7 @@ private[log] class LogCleanerManager(val logDirs: Array[File], val logs: Pool[To inLock(lock) { inProgress(topicPartition) match { case LogCleaningInProgress => - updateCheckpoints(dataDir,Option(topicPartition, endOffset)) + updateCheckpoints(dataDir, Option(topicPartition, endOffset)) inProgress.remove(topicPartition) case LogCleaningAborted => inProgress.put(topicPartition, LogCleaningPaused) diff --git a/core/src/main/scala/kafka/log/LogManager.scala b/core/src/main/scala/kafka/log/LogManager.scala index 2df52414e3f4b..bd489a93f1dee 100755 --- a/core/src/main/scala/kafka/log/LogManager.scala +++ b/core/src/main/scala/kafka/log/LogManager.scala @@ -21,8 +21,10 @@ import java.io._ import java.nio.file.Files import java.util.concurrent._ +import com.yammer.metrics.core.Gauge import kafka.admin.AdminUtils import kafka.common.{KafkaException, KafkaStorageException} +import kafka.metrics.KafkaMetricsGroup import kafka.server.checkpoints.OffsetCheckpointFile import kafka.server.{BrokerState, RecoveringFromUncleanShutdown, _} import kafka.utils._ @@ -31,19 +33,21 @@ import org.apache.kafka.common.utils.Time import scala.collection.JavaConverters._ import scala.collection._ +import scala.collection.mutable.ArrayBuffer /** * The entry point to the kafka log management subsystem. The log manager is responsible for log creation, retrieval, and cleaning. * All read and write operations are delegated to the individual log instances. - * + * * The log manager maintains logs in one or more directories. New logs are created in the data directory * with the fewest logs. 
No attempt is made to move partitions after the fact or balance based on * size or I/O rate. - * + * * A background thread handles log retention by periodically truncating excess log segments. */ @threadsafe -class LogManager(val logDirs: Array[File], +class LogManager(private val logDirs: Array[File], + private val initialOfflineDirs: Array[File], val topicConfigs: Map[String, LogConfig], // note that this doesn't get updated after creation val defaultConfig: LogConfig, val cleanerConfig: CleanerConfig, @@ -56,7 +60,8 @@ class LogManager(val logDirs: Array[File], scheduler: Scheduler, val brokerState: BrokerState, brokerTopicStats: BrokerTopicStats, - time: Time) extends Logging { + logDirFailureChannel: LogDirFailureChannel, + time: Time) extends Logging with KafkaMetricsGroup { val RecoveryPointCheckpointFile = "recovery-point-offset-checkpoint" val LogStartOffsetCheckpointFile = "log-start-offset-checkpoint" val LockFile = ".lock" @@ -66,42 +71,119 @@ class LogManager(val logDirs: Array[File], private val logs = new Pool[TopicPartition, Log]() private val logsToBeDeleted = new LinkedBlockingQueue[Log]() - createAndValidateLogDirs(logDirs) - private val dirLocks = lockLogDirs(logDirs) - private val recoveryPointCheckpoints = logDirs.map(dir => (dir, new OffsetCheckpointFile(new File(dir, RecoveryPointCheckpointFile)))).toMap - private val logStartOffsetCheckpoints = logDirs.map(dir => (dir, new OffsetCheckpointFile(new File(dir, LogStartOffsetCheckpointFile)))).toMap + private val _liveLogDirs: ConcurrentLinkedQueue[File] = createAndValidateLogDirs(logDirs, initialOfflineDirs) + + def liveLogDirs: Seq[File] = { + if (_liveLogDirs.size() == logDirs.size) + logDirs + else + _liveLogDirs.asScala.toSeq + } + + @volatile private var recoveryPointCheckpoints = liveLogDirs.map(dir => (dir, new OffsetCheckpointFile(new File(dir, RecoveryPointCheckpointFile)))).toMap + @volatile private var logStartOffsetCheckpoints = liveLogDirs.map(dir => (dir, new 
OffsetCheckpointFile(new File(dir, LogStartOffsetCheckpointFile)))).toMap + + private def offlineLogDirs = logDirs.filterNot(_liveLogDirs.contains) + loadLogs() + private val dirLocks = lockLogDirs(liveLogDirs) + // public, so we can access this from kafka.admin.DeleteTopicTest val cleaner: LogCleaner = if(cleanerConfig.enableCleaner) - new LogCleaner(cleanerConfig, logDirs, logs, time = time) + new LogCleaner(cleanerConfig, liveLogDirs, logs, logDirFailureChannel, time = time) else null - + + val offlineLogDirectoryCount = newGauge( + "OfflineLogDirectoryCount", + new Gauge[Int] { + def value = offlineLogDirs.length + } + ) + + for (dir <- logDirs) { + newGauge( + "OfflineLogDirectoryCount", + new Gauge[Int] { + def value = if (_liveLogDirs.contains(dir)) 0 else 1 + }, + Map("logDirectory" -> dir.getAbsolutePath) + ) + } + /** * Create and check validity of the given directories, specifically: *
    *
  1. Ensure that there are no duplicates in the directory list *
  2. Create each directory if it doesn't exist - *
  3. Check that each path is a readable directory + *
  4. Check that each path is a readable directory *
*/ - private def createAndValidateLogDirs(dirs: Seq[File]) { + private def createAndValidateLogDirs(dirs: Seq[File], initialOfflineDirs: Seq[File]): ConcurrentLinkedQueue[File] = { if(dirs.map(_.getCanonicalPath).toSet.size < dirs.size) - throw new KafkaException("Duplicate log directory found: " + logDirs.mkString(", ")) - for(dir <- dirs) { - if(!dir.exists) { - info("Log directory '" + dir.getAbsolutePath + "' not found, creating it.") - val created = dir.mkdirs() - if(!created) - throw new KafkaException("Failed to create data directory " + dir.getAbsolutePath) + throw new KafkaException("Duplicate log directory found: " + dirs.mkString(", ")) + + val liveLogDirs = new ConcurrentLinkedQueue[File]() + + for (dir <- dirs if !initialOfflineDirs.contains(dir)) { + try { + if (!dir.exists) { + info("Log directory '" + dir.getAbsolutePath + "' not found, creating it.") + val created = dir.mkdirs() + if (!created) + throw new KafkaStorageException("Failed to create data directory " + dir.getAbsolutePath) + } + if (!dir.isDirectory || !dir.canRead) + throw new KafkaStorageException(dir.getAbsolutePath + " is not a readable log directory.") + liveLogDirs.add(dir) + } catch { + case e@ ( _: IOException | _: KafkaStorageException ) => + error(s"Failed to create or validate data directory $dir.getAbsolutePath", e) } - if(!dir.isDirectory || !dir.canRead) - throw new KafkaException(dir.getAbsolutePath + " is not a readable log directory.") } + if (liveLogDirs.isEmpty) { + fatal(s"Shutdown broker because none of the specified log dirs from " + dirs.mkString(", ") + " can be created or validated") + Runtime.getRuntime().halt(1) + } + + liveLogDirs } - + + def handleLogDirFailure(dir: String) { + if (!logDirs.exists(_.getAbsolutePath == dir)) + throw new RuntimeException(s"Log dir $dir is not found in the config.") + + if (!_liveLogDirs.contains(new File(dir))) + return + + info(s"Stopping serving logs in dir $dir") + logCreationOrDeletionLock synchronized { + 
_liveLogDirs.remove(new File(dir)) + if (_liveLogDirs.isEmpty) { + fatal(s"Shutdown broker because all log dirs in ${logDirs.mkString(", ")} have failed") + Runtime.getRuntime().halt(1) + } + + recoveryPointCheckpoints = recoveryPointCheckpoints.filterKeys(file => file.getAbsolutePath != dir) + logStartOffsetCheckpoints = logStartOffsetCheckpoints.filterKeys(file => file.getAbsolutePath != dir) + if (cleaner != null) cleaner.handleLogDirFailure(dir) + + val offlineTopicPartitions = logs.filter { case (tp, log) => log.dir.getParent == dir}.map { case (tp, log) => tp} + + offlineTopicPartitions.foreach(topicPartition => { + val removedLog = logs.remove(topicPartition) + if (removedLog != null) { + removedLog.closeHandlers() + removedLog.removeLogMetrics() + } + }) + info(s"Partitions ${offlineTopicPartitions.mkString(",")} are offline due to failure on log directory $dir") + dirLocks.filter(_.file.getParent == dir).foreach(_.destroy()) + } + } + /** * Lock all the given directories */ @@ -109,97 +191,114 @@ class LogManager(val logDirs: Array[File], dirs.map { dir => val lock = new FileLock(new File(dir, LockFile)) if(!lock.tryLock()) - throw new KafkaException("Failed to acquire lock on file .lock in " + lock.file.getParentFile.getAbsolutePath + + throw new KafkaException("Failed to acquire lock on file .lock in " + lock.file.getParentFile.getAbsolutePath + ". 
A Kafka instance in another process or thread is using this directory.") lock } } - + + private def loadLogs(logDir: File, recoveryPoints: Map[TopicPartition, Long], logStartOffsets: Map[TopicPartition, Long]): Unit = { + debug("Loading log '" + logDir.getName + "'") + val topicPartition = Log.parseTopicPartitionName(logDir) + val config = topicConfigs.getOrElse(topicPartition.topic, defaultConfig) + val logRecoveryPoint = recoveryPoints.getOrElse(topicPartition, 0L) + val logStartOffset = logStartOffsets.getOrElse(topicPartition, 0L) + + val current = Log( + dir = logDir, + config = config, + logStartOffset = logStartOffset, + recoveryPoint = logRecoveryPoint, + maxProducerIdExpirationMs = maxPidExpirationMs, + scheduler = scheduler, + time = time, + brokerTopicStats = brokerTopicStats) + + if (logDir.getName.endsWith(Log.DeleteDirSuffix)) { + this.logsToBeDeleted.add(current) + } else { + val previous = this.logs.put(topicPartition, current) + if (previous != null) { + throw new IllegalArgumentException( + "Duplicate log directories found: %s, %s!".format( + current.dir.getAbsolutePath, previous.dir.getAbsolutePath)) + } + } + } + /** * Recover and load all logs in the given data directories */ private def loadLogs(): Unit = { info("Loading logs.") val startMs = time.milliseconds - val threadPools = mutable.ArrayBuffer.empty[ExecutorService] + val threadPools = ArrayBuffer.empty[ExecutorService] + val offlineDirs = ArrayBuffer.empty[String] val jobs = mutable.Map.empty[File, Seq[Future[_]]] - for (dir <- this.logDirs) { - val pool = Executors.newFixedThreadPool(ioThreads) - threadPools.append(pool) - - val cleanShutdownFile = new File(dir, Log.CleanShutdownFile) - - if (cleanShutdownFile.exists) { - debug( - "Found clean shutdown file. 
" + - "Skipping recovery for all logs in data directory: " + - dir.getAbsolutePath) - } else { - // log recovery itself is being performed by `Log` class during initialization - brokerState.newState(RecoveringFromUncleanShutdown) - } - - var recoveryPoints = Map[TopicPartition, Long]() + for (dir <- liveLogDirs) { try { - recoveryPoints = this.recoveryPointCheckpoints(dir).read - } catch { - case e: Exception => - warn("Error occurred while reading recovery-point-offset-checkpoint file of directory " + dir, e) - warn("Resetting the recovery checkpoint to 0") - } + val pool = Executors.newFixedThreadPool(ioThreads) + + val cleanShutdownFile = new File(dir, Log.CleanShutdownFile) + + if (cleanShutdownFile.exists) { + debug( + "Found clean shutdown file. " + + "Skipping recovery for all logs in data directory: " + + dir.getAbsolutePath) + } else { + // log recovery itself is being performed by `Log` class during initialization + brokerState.newState(RecoveringFromUncleanShutdown) + } - var logStartOffsets = Map[TopicPartition, Long]() - try { - logStartOffsets = this.logStartOffsetCheckpoints(dir).read - } catch { - case e: Exception => - warn("Error occurred while reading log-start-offset-checkpoint file of directory " + dir, e) - } + var recoveryPoints = Map[TopicPartition, Long]() + try { + recoveryPoints = this.recoveryPointCheckpoints(dir).read + } catch { + case e: Exception => + warn("Error occurred while reading recovery-point-offset-checkpoint file of directory " + dir, e) + warn("Resetting the recovery checkpoint to 0") + } - val jobsForDir = for { - dirContent <- Option(dir.listFiles).toList - logDir <- dirContent if logDir.isDirectory - } yield { - CoreUtils.runnable { - debug("Loading log '" + logDir.getName + "'") - - val topicPartition = Log.parseTopicPartitionName(logDir) - val config = topicConfigs.getOrElse(topicPartition.topic, defaultConfig) - val logRecoveryPoint = recoveryPoints.getOrElse(topicPartition, 0L) - val logStartOffset = 
logStartOffsets.getOrElse(topicPartition, 0L) - - val current = Log( - dir = logDir, - config = config, - logStartOffset = logStartOffset, - recoveryPoint = logRecoveryPoint, - maxProducerIdExpirationMs = maxPidExpirationMs, - scheduler = scheduler, - time = time, - brokerTopicStats = brokerTopicStats) - if (logDir.getName.endsWith(Log.DeleteDirSuffix)) { - this.logsToBeDeleted.add(current) - } else { - val previous = this.logs.put(topicPartition, current) - if (previous != null) { - throw new IllegalArgumentException( - "Duplicate log directories found: %s, %s!".format( - current.dir.getAbsolutePath, previous.dir.getAbsolutePath)) + var logStartOffsets = Map[TopicPartition, Long]() + try { + logStartOffsets = this.logStartOffsetCheckpoints(dir).read + } catch { + case e: Exception => + warn("Error occurred while reading log-start-offset-checkpoint file of directory " + dir, e) + } + + val jobsForDir = for { + dirContent <- Option(dir.listFiles).toList + logDir <- dirContent if logDir.isDirectory + } yield { + CoreUtils.runnable { + try { + loadLogs(logDir, recoveryPoints, logStartOffsets) + } catch { + case e: IOException => + offlineDirs.append(dir.getAbsolutePath) + error("Error while load log dir " + dir.getAbsolutePath, e) } } } - } - jobs(cleanShutdownFile) = jobsForDir.map(pool.submit) + threadPools.append(pool) + jobs(cleanShutdownFile) = jobsForDir.map(pool.submit) + } catch { + case e: IOException => + offlineDirs.append(dir.getAbsolutePath) + error("Error while load log dir " + dir.getAbsolutePath, e) + } } - try { for ((cleanShutdownFile, dirJobs) <- jobs) { dirJobs.foreach(_.get) cleanShutdownFile.delete() } + offlineDirs.foreach(logDirFailureChannel.maybeAddLogFailureEvent) } catch { case e: ExecutionException => { error("There was an error in one of the threads during logs loading: " + e.getCause) @@ -231,7 +330,7 @@ class LogManager(val logDirs: Array[File], period = flushCheckMs, TimeUnit.MILLISECONDS) 
scheduler.schedule("kafka-recovery-point-checkpoint", - checkpointRecoveryPointOffsets _, + checkpointLogRecoveryOffsets _, delay = InitialTaskDelayMs, period = flushRecoveryOffsetCheckpointMs, TimeUnit.MILLISECONDS) @@ -256,7 +355,12 @@ class LogManager(val logDirs: Array[File], def shutdown() { info("Shutting down.") - val threadPools = mutable.ArrayBuffer.empty[ExecutorService] + removeMetric("OfflineLogDirectoryCount") + for (dir <- logDirs) { + removeMetric("OfflineLogDirectoryCount", Map("logDirectory" -> dir.getAbsolutePath)) + } + + val threadPools = ArrayBuffer.empty[ExecutorService] val jobs = mutable.Map.empty[File, Seq[Future[_]]] // stop the cleaner first @@ -265,7 +369,7 @@ class LogManager(val logDirs: Array[File], } // close logs in each dir - for (dir <- this.logDirs) { + for (dir <- liveLogDirs) { debug("Flushing and closing logs at " + dir) val pool = Executors.newFixedThreadPool(ioThreads) @@ -331,17 +435,22 @@ class LogManager(val logDirs: Array[File], log.truncateTo(truncateOffset) if (needToStopCleaner) cleaner.maybeTruncateCheckpoint(log.dir.getParentFile, topicPartition, log.activeSegment.baseOffset) + } catch { + case e: IOException => + error(s"Failed to truncate the log for $topicPartition due to IOException", e) + logDirFailureChannel.maybeAddLogFailureEvent(log.dir.getParent) } finally { if (needToStopCleaner) cleaner.resumeCleaning(topicPartition) } } } - checkpointRecoveryPointOffsets() + checkpointLogRecoveryOffsets() } /** * Delete all data in a partition and start the log at the new offset + * * @param newOffset The new offset to start the log with */ def truncateFullyAndStartAt(topicPartition: TopicPartition, newOffset: Long) { @@ -351,21 +460,27 @@ class LogManager(val logDirs: Array[File], //Abort and pause the cleaning of the log, and resume after truncation is done. 
if (cleaner != null) cleaner.abortAndPauseCleaning(topicPartition) - log.truncateFullyAndStartAt(newOffset) + try { + log.truncateFullyAndStartAt(newOffset) + } catch { + case e: IOException => + error(s"Failed to fully truncate the log for $topicPartition due to IOException", e) + logDirFailureChannel.maybeAddLogFailureEvent(log.dir.getParent) + } if (cleaner != null) { cleaner.maybeTruncateCheckpoint(log.dir.getParentFile, topicPartition, log.activeSegment.baseOffset) cleaner.resumeCleaning(topicPartition) } } - checkpointRecoveryPointOffsets() + checkpointLogRecoveryOffsets() } /** - * Write out the current recovery point for all logs to a text file in the log directory + * Write out the current recovery point for all logs to a text file in the log directory * to avoid recovering the whole log on startup. */ - def checkpointRecoveryPointOffsets() { - this.logDirs.foreach(checkpointLogRecoveryOffsetsInDir) + def checkpointLogRecoveryOffsets() { + liveLogDirs.foreach(checkpointLogRecoveryOffsetsInDir) } /** @@ -373,7 +488,7 @@ class LogManager(val logDirs: Array[File], * to avoid exposing data that have been deleted by DeleteRecordsRequest */ def checkpointLogStartOffsets() { - this.logDirs.foreach(checkpointLogStartOffsetsInDir) + liveLogDirs.foreach(checkpointLogStartOffsetsInDir) } /** @@ -382,7 +497,13 @@ class LogManager(val logDirs: Array[File], private def checkpointLogRecoveryOffsetsInDir(dir: File): Unit = { val recoveryPoints = this.logsByDir.get(dir.toString) if (recoveryPoints.isDefined) { - this.recoveryPointCheckpoints(dir).write(recoveryPoints.get.mapValues(_.recoveryPoint)) + try { + this.recoveryPointCheckpoints.get(dir).foreach(_.write(recoveryPoints.get.mapValues(_.recoveryPoint))) + } catch { + case e: IOException => + error("Disk error while writing to recovery point file", e) + logDirFailureChannel.maybeAddLogFailureEvent(dir.getAbsolutePath) + } } } @@ -392,8 +513,15 @@ class LogManager(val logDirs: Array[File], private def 
checkpointLogStartOffsetsInDir(dir: File): Unit = { val logs = this.logsByDir.get(dir.toString) if (logs.isDefined) { - this.logStartOffsetCheckpoints(dir).write( - logs.get.filter{case (tp, log) => log.logStartOffset > log.logSegments.head.baseOffset}.mapValues(_.logStartOffset)) + try { + this.logStartOffsetCheckpoints.get(dir).foreach(_.write( + logs.get.filter { case (tp, log) => log.logStartOffset > log.logSegments.head.baseOffset }.mapValues(_.logStartOffset) + )) + } catch { + case e: IOException => + error("Disk error while writing to logStartOffset file", e) + logDirFailureChannel.maybeAddLogFailureEvent(dir.getAbsolutePath) + } } } @@ -403,13 +531,20 @@ class LogManager(val logDirs: Array[File], def getLog(topicPartition: TopicPartition): Option[Log] = Option(logs.get(topicPartition)) /** - * Create a log for the given topic and the given partition * If the log already exists, just return a copy of the existing log + * Otherwise if isNew=true or if there is no offline log directory, create a log for the given topic and the given partition + * Otherwise throw KafkaStorageException + * + * @param isNew Whether the replica should have existed on the broker or not + * @throws KafkaStorageException if isNew=false, log is not found in the cache and there is offline log directory on the broker */ - def createLog(topicPartition: TopicPartition, config: LogConfig): Log = { + def getOrCreateLog(topicPartition: TopicPartition, config: LogConfig, isNew: Boolean = false): Log = { logCreationOrDeletionLock synchronized { - // create the log if it has not already been created in another thread getLog(topicPartition).getOrElse { + // create the log if it has not already been created in another thread + if (!isNew && offlineLogDirs.nonEmpty) + throw new KafkaStorageException(s"Can not create log for $topicPartition because log directories ${offlineLogDirs.mkString(",")} are offline") + val dataDir = nextLogDir() val dir = new File(dataDir, topicPartition.topic + "-" + 
topicPartition.partition) Files.createDirectories(dir.toPath) @@ -424,6 +559,7 @@ class LogManager(val logDirs: Array[File], time = time, brokerTopicStats = brokerTopicStats) logs.put(topicPartition, log) + info("Created log for partition [%s,%d] in %s with properties {%s}." .format(topicPartition.topic, topicPartition.partition, @@ -439,30 +575,29 @@ class LogManager(val logDirs: Array[File], */ private def deleteLogs(): Unit = { try { - var failed = 0 - while (!logsToBeDeleted.isEmpty && failed < logsToBeDeleted.size()) { + while (!logsToBeDeleted.isEmpty) { val removedLog = logsToBeDeleted.take() if (removedLog != null) { try { removedLog.delete() info(s"Deleted log for partition ${removedLog.topicPartition} in ${removedLog.dir.getAbsolutePath}.") } catch { - case e: Throwable => - error(s"Exception in deleting $removedLog. Moving it to the end of the queue.", e) - failed = failed + 1 - logsToBeDeleted.put(removedLog) + case e: IOException => + error(s"Exception while deleting $removedLog.", e) + logDirFailureChannel.maybeAddLogFailureEvent(removedLog.dir.getParentFile.getAbsolutePath) } } } } catch { - case e: Throwable => + case e: Throwable => error(s"Exception in kafka-delete-logs thread.", e) } } /** - * Rename the directory of the given topic-partition "logdir" as "logdir.uuid.delete" and - * add it in the queue for deletion. + * Rename the directory of the given topic-partition "logdir" as "logdir.uuid.delete" and + * add it in the queue for deletion. 
+ * * @param topicPartition TopicPartition that needs to be deleted */ def asyncDelete(topicPartition: TopicPartition) = { @@ -494,6 +629,8 @@ class LogManager(val logDirs: Array[File], } else { throw new KafkaStorageException("Failed to rename log directory from " + removedLog.dir.getAbsolutePath + " to " + renamedDir.getAbsolutePath) } + } else if (offlineLogDirs.nonEmpty) { + throw new KafkaStorageException("Can not delete log for " + topicPartition + " because it may be on offline directories " + offlineLogDirs.mkString(",")) } } @@ -503,14 +640,14 @@ class LogManager(val logDirs: Array[File], * data directory with the fewest partitions. */ private def nextLogDir(): File = { - if(logDirs.size == 1) { - logDirs(0) + if(_liveLogDirs.size == 1) { + _liveLogDirs.peek() } else { // count the number of logs in each parent directory (including 0 for empty directories val logCounts = allLogs.groupBy(_.dir.getParent).mapValues(_.size) - val zeros = logDirs.map(dir => (dir.getPath, 0)).toMap + val zeros = _liveLogDirs.asScala.map(dir => (dir.getPath, 0)).toMap val dirCounts = (zeros ++ logCounts).toBuffer - + // choose the directory with the least logs in it val leastLoaded = dirCounts.sortBy(_._2).head new File(leastLoaded._1) @@ -552,6 +689,10 @@ class LogManager(val logDirs: Array[File], } } + def isLogDirOnline(logDir: String): Boolean = { + _liveLogDirs.contains(new File(logDir)) + } + /** * Flush any log which has exceeded its flush interval and has unwritten messages. 
*/ @@ -575,11 +716,13 @@ class LogManager(val logDirs: Array[File], object LogManager { def apply(config: KafkaConfig, + initialOfflineDirs: Seq[String], zkUtils: ZkUtils, brokerState: BrokerState, kafkaScheduler: KafkaScheduler, time: Time, - brokerTopicStats: BrokerTopicStats): LogManager = { + brokerTopicStats: BrokerTopicStats, + logDirFailureChannel: LogDirFailureChannel): LogManager = { val defaultProps = KafkaServer.copyKafkaConfigToLog(config) val defaultLogConfig = LogConfig(defaultProps) @@ -598,6 +741,7 @@ object LogManager { enableCleaner = config.logCleanerEnable) new LogManager(logDirs = config.logDirs.map(new File(_)).toArray, + initialOfflineDirs = initialOfflineDirs.map(new File(_)).toArray, topicConfigs = topicConfigs, defaultConfig = defaultLogConfig, cleanerConfig = cleanerConfig, @@ -609,7 +753,8 @@ object LogManager { maxPidExpirationMs = config.transactionIdExpirationMs, scheduler = kafkaScheduler, brokerState = brokerState, - time = time, - brokerTopicStats = brokerTopicStats) + brokerTopicStats = brokerTopicStats, + logDirFailureChannel = logDirFailureChannel, + time = time) } } diff --git a/core/src/main/scala/kafka/log/LogSegment.scala b/core/src/main/scala/kafka/log/LogSegment.scala index 3e4c47def9cc5..70eef899d2096 100755 --- a/core/src/main/scala/kafka/log/LogSegment.scala +++ b/core/src/main/scala/kafka/log/LogSegment.scala @@ -480,6 +480,16 @@ class LogSegment(val log: FileRecords, CoreUtils.swallow(txnIndex.close()) } + /** + * Close file handlers used by the log segment but don't write to disk. This is used when the disk may have failed + */ + def closeHandlers() { + CoreUtils.swallow(index.closeHandler()) + CoreUtils.swallow(timeIndex.closeHandler()) + CoreUtils.swallow(log.closeHandlers()) + CoreUtils.swallow(txnIndex.close()) + } + /** * Delete this log segment from the filesystem. 
* diff --git a/core/src/main/scala/kafka/server/AbstractFetcherThread.scala b/core/src/main/scala/kafka/server/AbstractFetcherThread.scala index b17d255e05f8c..5c2fdbe8a577d 100755 --- a/core/src/main/scala/kafka/server/AbstractFetcherThread.scala +++ b/core/src/main/scala/kafka/server/AbstractFetcherThread.scala @@ -18,22 +18,18 @@ package kafka.server import java.util.concurrent.locks.ReentrantLock - import kafka.cluster.BrokerEndPoint -import kafka.consumer.PartitionTopicInfo import kafka.utils.{DelayedItem, Pool, ShutdownableThread} -import kafka.common.{ClientIdAndBroker, KafkaException} +import kafka.common.{ClientIdAndBroker, KafkaException, KafkaStorageException} import kafka.metrics.KafkaMetricsGroup import kafka.utils.CoreUtils.inLock import org.apache.kafka.common.errors.CorruptRecordException import org.apache.kafka.common.protocol.Errors import AbstractFetcherThread._ - import scala.collection.{Map, Set, mutable} import scala.collection.JavaConverters._ import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicLong - import com.yammer.metrics.core.Gauge import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.internals.{FatalExitError, PartitionStates} @@ -72,7 +68,7 @@ abstract class AbstractFetcherThread(name: String, protected def handleOffsetOutOfRange(topicPartition: TopicPartition): Long // deal with partitions with errors, potentially due to leadership changes - protected def handlePartitionsWithErrors(partitions: Iterable[TopicPartition]) + protected def handlePartitionsWithErrors(partitions: Map[TopicPartition, Option[Exception]]) protected def buildLeaderEpochRequest(allPartitions: Seq[(TopicPartition, PartitionFetchState)]): Map[TopicPartition, Int] @@ -137,10 +133,10 @@ abstract class AbstractFetcherThread(name: String, } private def processFetchRequest(fetchRequest: REQ) { - val partitionsWithError = mutable.Set[TopicPartition]() + val partitionsWithError = mutable.HashMap[TopicPartition, 
Option[Exception]]() - def updatePartitionsWithError(partition: TopicPartition): Unit = { - partitionsWithError += partition + def updatePartitionsWithError(partition: TopicPartition, e: Option[Exception]): Unit = { + partitionsWithError.put(partition, e) partitionStates.moveToEnd(partition) } @@ -154,7 +150,7 @@ abstract class AbstractFetcherThread(name: String, if (isRunning.get) { warn(s"Error in fetch to broker ${sourceBroker.id}, request ${fetchRequest}", t) inLock(partitionMapLock) { - partitionStates.partitionSet.asScala.foreach(updatePartitionsWithError) + partitionStates.partitionSet.asScala.foreach(updatePartitionsWithError(_, None)) // there is an error occurred while fetching partitions, sleep a while // note that `ReplicaFetcherThread.handlePartitionsWithError` will also introduce the same delay for every // partition with error effectively doubling the delay. It would be good to improve this. @@ -198,7 +194,9 @@ abstract class AbstractFetcherThread(name: String, // 2. If the message is corrupt due to a transient state in the log (truncation, partial writes can cause this), we simply continue and // should get fixed in the subsequent fetches logger.error("Found invalid messages during fetch for partition [" + topic + "," + partitionId + "] offset " + currentPartitionFetchState.fetchOffset + " error " + ime.getMessage) - updatePartitionsWithError(topicPartition); + updatePartitionsWithError(topicPartition, None) + case e: KafkaStorageException => + updatePartitionsWithError(topicPartition, Some(e)) case e: Throwable => throw new KafkaException("error processing data for partition [%s,%d] offset %d" .format(topic, partitionId, currentPartitionFetchState.fetchOffset), e) @@ -213,13 +211,13 @@ abstract class AbstractFetcherThread(name: String, case e: FatalExitError => throw e case e: Throwable => error("Error getting offset for partition [%s,%d] to broker %d".format(topic, partitionId, sourceBroker.id), e) - updatePartitionsWithError(topicPartition) + 
updatePartitionsWithError(topicPartition, None) } - case _ => + case e => if (isRunning.get) { error("Error for partition [%s,%d] to broker %d:%s".format(topic, partitionId, sourceBroker.id, partitionData.exception.get)) - updatePartitionsWithError(topicPartition) + updatePartitionsWithError(topicPartition, None) } } }) diff --git a/core/src/main/scala/kafka/server/BrokerMetadataCheckpoint.scala b/core/src/main/scala/kafka/server/BrokerMetadataCheckpoint.scala index 8630026d1f711..8ac9864122e9c 100755 --- a/core/src/main/scala/kafka/server/BrokerMetadataCheckpoint.scala +++ b/core/src/main/scala/kafka/server/BrokerMetadataCheckpoint.scala @@ -30,7 +30,6 @@ case class BrokerMetadata(brokerId: Int) */ class BrokerMetadataCheckpoint(val file: File) extends Logging { private val lock = new Object() - Files.deleteIfExists(new File(file + ".tmp").toPath()) // try to delete any existing temp files for cleanliness def write(brokerMetadata: BrokerMetadata) = { lock synchronized { @@ -57,6 +56,8 @@ class BrokerMetadataCheckpoint(val file: File) extends Logging { } def read(): Option[BrokerMetadata] = { + Files.deleteIfExists(new File(file + ".tmp").toPath()) // try to delete any existing temp files for cleanliness + lock synchronized { try { val brokerMetaProps = new VerifiableProperties(Utils.loadProps(file.getAbsolutePath())) diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 9e9299fde14ad..fb0c51e2b8d9e 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -54,7 +54,7 @@ import org.apache.kafka.common.requests.SaslHandshakeResponse import org.apache.kafka.common.resource.{Resource => AdminResource} import org.apache.kafka.common.acl.{AccessControlEntry, AclBinding} -import scala.collection._ +import scala.collection.{mutable, _} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.util.{Failure, 
Success, Try} @@ -144,40 +144,33 @@ class KafkaApis(val requestChannel: RequestChannel, val correlationId = request.header.correlationId val leaderAndIsrRequest = request.body[LeaderAndIsrRequest] - try { - def onLeadershipChange(updatedLeaders: Iterable[Partition], updatedFollowers: Iterable[Partition]) { - // for each new leader or follower, call coordinator to handle consumer group migration. - // this callback is invoked under the replica state change lock to ensure proper order of - // leadership changes - updatedLeaders.foreach { partition => - if (partition.topic == GROUP_METADATA_TOPIC_NAME) - groupCoordinator.handleGroupImmigration(partition.partitionId) - else if (partition.topic == TRANSACTION_STATE_TOPIC_NAME) - txnCoordinator.handleTxnImmigration(partition.partitionId, partition.getLeaderEpoch) - } - - updatedFollowers.foreach { partition => - if (partition.topic == GROUP_METADATA_TOPIC_NAME) - groupCoordinator.handleGroupEmigration(partition.partitionId) - else if (partition.topic == TRANSACTION_STATE_TOPIC_NAME) - txnCoordinator.handleTxnEmigration(partition.partitionId, partition.getLeaderEpoch) - } + def onLeadershipChange(updatedLeaders: Iterable[Partition], updatedFollowers: Iterable[Partition]) { + // for each new leader or follower, call coordinator to handle consumer group migration. 
+ // this callback is invoked under the replica state change lock to ensure proper order of + // leadership changes + updatedLeaders.foreach { partition => + if (partition.topic == GROUP_METADATA_TOPIC_NAME) + groupCoordinator.handleGroupImmigration(partition.partitionId) + else if (partition.topic == TRANSACTION_STATE_TOPIC_NAME) + txnCoordinator.handleTxnImmigration(partition.partitionId, partition.getLeaderEpoch) } - if (authorize(request.session, ClusterAction, Resource.ClusterResource)) { - val result = replicaManager.becomeLeaderOrFollower(correlationId, leaderAndIsrRequest, onLeadershipChange) - val leaderAndIsrResponse = new LeaderAndIsrResponse(result.error, result.responseMap.asJava) - sendResponseExemptThrottle(RequestChannel.Response(request, leaderAndIsrResponse)) - } else { - val result = leaderAndIsrRequest.partitionStates.asScala.keys.map((_, Errors.CLUSTER_AUTHORIZATION_FAILED)).toMap - sendResponseMaybeThrottle(request, _ => - new LeaderAndIsrResponse(Errors.CLUSTER_AUTHORIZATION_FAILED, result.asJava)) + updatedFollowers.foreach { partition => + if (partition.topic == GROUP_METADATA_TOPIC_NAME) + groupCoordinator.handleGroupEmigration(partition.partitionId) + else if (partition.topic == TRANSACTION_STATE_TOPIC_NAME) + txnCoordinator.handleTxnEmigration(partition.partitionId, partition.getLeaderEpoch) } - } catch { - case e: FatalExitError => throw e - case e: KafkaStorageException => - fatal("Disk error during leadership change.", e) - Exit.halt(1) + } + + if (authorize(request.session, ClusterAction, Resource.ClusterResource)) { + val result = replicaManager.becomeLeaderOrFollower(correlationId, leaderAndIsrRequest, onLeadershipChange) + val leaderAndIsrResponse = new LeaderAndIsrResponse(result.error, result.responseMap.asJava) + sendResponseExemptThrottle(RequestChannel.Response(request, leaderAndIsrResponse)) + } else { + val result = leaderAndIsrRequest.partitionStates.asScala.keys.map((_, Errors.CLUSTER_AUTHORIZATION_FAILED)).toMap + 
sendResponseMaybeThrottle(request, _ => + new LeaderAndIsrResponse(Errors.CLUSTER_AUTHORIZATION_FAILED, result.asJava)) } } @@ -1865,7 +1858,7 @@ class KafkaApis(val requestChannel: RequestChannel, Some(new AclDeletionResult(ApiError.fromThrowable(throwable), aclBinding)) } }.asJava - + filterResponseMap.put(i, new AclFilterResponse(deletionResults)) } diff --git a/core/src/main/scala/kafka/server/KafkaServer.scala b/core/src/main/scala/kafka/server/KafkaServer.scala index cc34e147df3ec..fc9e4b81513ca 100755 --- a/core/src/main/scala/kafka/server/KafkaServer.scala +++ b/core/src/main/scala/kafka/server/KafkaServer.scala @@ -47,7 +47,7 @@ import org.apache.kafka.common.utils.{AppInfoParser, Time} import org.apache.kafka.common.{ClusterResource, Node} import scala.collection.JavaConverters._ -import scala.collection.{Map, mutable} +import scala.collection.{Seq, Map, mutable} object KafkaServer { // Copy the subset of properties that are relevant to Logs @@ -110,6 +110,7 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP var socketServer: SocketServer = null var requestHandlerPool: KafkaRequestHandlerPool = null + var logDirFailureChannel: LogDirFailureChannel = null var logManager: LogManager = null var replicaManager: ReplicaManager = null @@ -195,7 +196,8 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP info(s"Cluster ID = $clusterId") /* generate brokerId */ - config.brokerId = getBrokerId + val (brokerId, initialOfflineDirs) = getBrokerIdAndOfflineDirs + config.brokerId = brokerId this.logIdent = "[Kafka Server " + config.brokerId + "], " /* create and configure metrics */ @@ -211,8 +213,10 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP quotaManagers = QuotaFactory.instantiate(config, metrics, time) notifyClusterListeners(kafkaMetricsReporters ++ reporters.asScala) + logDirFailureChannel = new LogDirFailureChannel(config.logDirs.size) + /* start log manager */ 
- logManager = LogManager(config, zkUtils, brokerState, kafkaScheduler, time, brokerTopicStats) + logManager = LogManager(config, initialOfflineDirs, zkUtils, brokerState, kafkaScheduler, time, brokerTopicStats, logDirFailureChannel) logManager.startup() metadataCache = new MetadataCache(config.brokerId) @@ -307,7 +311,7 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP protected def createReplicaManager(isShuttingDown: AtomicBoolean): ReplicaManager = new ReplicaManager(config, metrics, time, zkUtils, kafkaScheduler, logManager, isShuttingDown, quotaManagers.follower, - brokerTopicStats, metadataCache) + brokerTopicStats, metadataCache, logDirFailureChannel) private def initZk(): ZkUtils = { info(s"Connecting to zookeeper on ${config.zkConnect}") @@ -582,7 +586,7 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP if (shutdownLatch.getCount > 0 && isShuttingDown.compareAndSet(false, true)) { CoreUtils.swallow(controlledShutdown()) brokerState.newState(BrokerShuttingDown) - + if (socketServer != null) CoreUtils.swallow(socketServer.shutdown()) if (requestHandlerPool != null) @@ -651,16 +655,25 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP *
  • config has broker.id and there is no meta.properties file, creates new meta.properties and stores broker.id *
      * - * @return A brokerId. + * The log directories whose meta.properties can not be accessed due to IOException will be returned to the caller + * + * @return A 2-tuple containing the brokerId and a sequence of offline log directories. */ - private def getBrokerId: Int = { + private def getBrokerIdAndOfflineDirs: (Int, Seq[String]) = { var brokerId = config.brokerId val brokerIdSet = mutable.HashSet[Int]() + val offlineDirs = mutable.ArrayBuffer.empty[String] for (logDir <- config.logDirs) { - val brokerMetadataOpt = brokerMetadataCheckpoints(logDir).read() - brokerMetadataOpt.foreach { brokerMetadata => - brokerIdSet.add(brokerMetadata.brokerId) + try { + val brokerMetadataOpt = brokerMetadataCheckpoints(logDir).read() + brokerMetadataOpt.foreach { brokerMetadata => + brokerIdSet.add(brokerMetadata.brokerId) + } + } catch { + case e : IOException => + offlineDirs += logDir + error(s"Fail to read ${brokerMetaPropsFile} under log directory ${logDir}", e) } } @@ -678,16 +691,17 @@ class KafkaServer(val config: KafkaConfig, time: Time = Time.SYSTEM, threadNameP else if(brokerIdSet.size == 1) // pick broker.id from meta.properties brokerId = brokerIdSet.last - brokerId + + (brokerId, offlineDirs) } private def checkpointBrokerId(brokerId: Int) { var logDirsWithoutMetaProps: List[String] = List() - for (logDir <- config.logDirs) { - val brokerMetadataOpt = brokerMetadataCheckpoints(logDir).read() + for (logDir <- logManager.liveLogDirs) { + val brokerMetadataOpt = brokerMetadataCheckpoints(logDir.getAbsolutePath).read() if(brokerMetadataOpt.isEmpty) - logDirsWithoutMetaProps ++= List(logDir) + logDirsWithoutMetaProps ++= List(logDir.getAbsolutePath) } for(logDir <- logDirsWithoutMetaProps) { diff --git a/core/src/main/scala/kafka/server/LogDirFailureChannel.scala b/core/src/main/scala/kafka/server/LogDirFailureChannel.scala new file mode 100644 index 0000000000000..1e8acc2aa0819 --- /dev/null +++ b/core/src/main/scala/kafka/server/LogDirFailureChannel.scala @@ -0,0 
+1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package kafka.server + +import java.util.concurrent.{ArrayBlockingQueue, ConcurrentHashMap, TimeUnit} + +/* + * LogDirFailureChannel allows an external thread to block waiting for new offline log dir + * Classes such as ReplicaManager and LogManager can announce new offline log dir via LogDirFailureChannel. + */ +class LogDirFailureChannel(logDirNum: Int) { + + private val offlineLogDirs = new ConcurrentHashMap[String, String] + private val logDirFailureEvent = new ArrayBlockingQueue[String](logDirNum) + + /* + * If the given logDir is not already offline, add it to the + * set of offline log dirs and enqueue it to the logDirFailureEvent queue + */ + def maybeAddLogFailureEvent(logDir: String): Unit = { + if (offlineLogDirs.putIfAbsent(logDir, logDir) == null) { + logDirFailureEvent.add(logDir) + } + } + + /* + * Get the next offline log dir from logDirFailureEvent queue. + * Block waiting for up to `timeout` milliseconds if there is no new offline log dir; returns null if the wait times out.
+ */ + def takeNextLogFailureEvent(timeout: Long): String = { + logDirFailureEvent.poll(timeout, TimeUnit.MILLISECONDS) + } + +} diff --git a/core/src/main/scala/kafka/server/MetadataCache.scala b/core/src/main/scala/kafka/server/MetadataCache.scala index 466645bba7d4a..fb8450443498e 100755 --- a/core/src/main/scala/kafka/server/MetadataCache.scala +++ b/core/src/main/scala/kafka/server/MetadataCache.scala @@ -31,6 +31,7 @@ import org.apache.kafka.common.internals.Topic import org.apache.kafka.common.{Node, TopicPartition} import org.apache.kafka.common.network.ListenerName import org.apache.kafka.common.protocol.Errors +import org.apache.kafka.common.requests.UpdateMetadataRequest.PartitionState import org.apache.kafka.common.requests.{MetadataResponse, PartitionState, UpdateMetadataRequest} /** @@ -39,7 +40,7 @@ import org.apache.kafka.common.requests.{MetadataResponse, PartitionState, Updat */ class MetadataCache(brokerId: Int) extends Logging { private val stateChangeLogger = KafkaController.stateChangeLogger - private val cache = mutable.Map[String, mutable.Map[Int, PartitionStateInfo]]() + private val cache = mutable.Map[String, mutable.Map[Int, MetadataPartitionState]]() private var controllerId: Option[Int] = None private val aliveBrokers = mutable.Map[Int, Broker]() private val aliveNodes = mutable.Map[Int, collection.Map[ListenerName, Node]]() @@ -73,12 +74,13 @@ class MetadataCache(brokerId: Int) extends Logging { val replicas = partitionState.allReplicas val replicaInfo = getEndpoints(replicas, listenerName, errorUnavailableEndpoints) + val offlineReplicaInfo = getEndpoints(partitionState.offlineReplicas, listenerName, errorUnavailableEndpoints) maybeLeader match { case None => debug(s"Error while fetching metadata for $topicPartition: leader not available") new MetadataResponse.PartitionMetadata(Errors.LEADER_NOT_AVAILABLE, partitionId, Node.noNode(), - replicaInfo.asJava, java.util.Collections.emptyList()) + replicaInfo.asJava, 
java.util.Collections.emptyList(), offlineReplicaInfo.asJava) case Some(leader) => val isr = leaderAndIsr.isr @@ -89,15 +91,15 @@ class MetadataCache(brokerId: Int) extends Logging { s"following brokers ${replicas.filterNot(replicaInfo.map(_.id).contains).mkString(",")}") new MetadataResponse.PartitionMetadata(Errors.REPLICA_NOT_AVAILABLE, partitionId, leader, - replicaInfo.asJava, isrInfo.asJava) + replicaInfo.asJava, isrInfo.asJava, offlineReplicaInfo.asJava) } else if (isrInfo.size < isr.size) { debug(s"Error while fetching metadata for $topicPartition: in sync replica information not available for " + s"following brokers ${isr.filterNot(isrInfo.map(_.id).contains).mkString(",")}") new MetadataResponse.PartitionMetadata(Errors.REPLICA_NOT_AVAILABLE, partitionId, leader, - replicaInfo.asJava, isrInfo.asJava) + replicaInfo.asJava, isrInfo.asJava, offlineReplicaInfo.asJava) } else { new MetadataResponse.PartitionMetadata(Errors.NONE, partitionId, leader, replicaInfo.asJava, - isrInfo.asJava) + isrInfo.asJava, offlineReplicaInfo.asJava) } } } @@ -147,14 +149,14 @@ class MetadataCache(brokerId: Int) extends Logging { private def addOrUpdatePartitionInfo(topic: String, partitionId: Int, - stateInfo: PartitionStateInfo) { + stateInfo: MetadataPartitionState) { inWriteLock(partitionMetadataLock) { val infos = cache.getOrElseUpdate(topic, mutable.Map()) infos(partitionId) = stateInfo } } - def getPartitionInfo(topic: String, partitionId: Int): Option[PartitionStateInfo] = { + def getPartitionInfo(topic: String, partitionId: Int): Option[MetadataPartitionState] = { inReadLock(partitionMetadataLock) { cache.get(topic).flatMap(_.get(partitionId)) } @@ -223,10 +225,10 @@ class MetadataCache(brokerId: Int) extends Logging { } } - private def partitionStateToPartitionStateInfo(partitionState: PartitionState): PartitionStateInfo = { + private def partitionStateToPartitionStateInfo(partitionState: UpdateMetadataRequest.PartitionState): MetadataPartitionState = { val leaderAndIsr 
= LeaderAndIsr(partitionState.leader, partitionState.leaderEpoch, partitionState.isr.asScala.map(_.toInt).toList, partitionState.zkVersion) val leaderInfo = LeaderIsrAndControllerEpoch(leaderAndIsr, partitionState.controllerEpoch) - PartitionStateInfo(leaderInfo, partitionState.replicas.asScala.map(_.toInt)) + MetadataPartitionState(leaderInfo, partitionState.replicas.asScala.map(_.toInt), partitionState.offlineReplicas.asScala.map(_.toInt)) } def contains(topic: String): Boolean = { diff --git a/core/src/main/scala/kafka/server/ReplicaFetcherThread.scala b/core/src/main/scala/kafka/server/ReplicaFetcherThread.scala index d7420dd5fdacd..9a6297ca9b162 100644 --- a/core/src/main/scala/kafka/server/ReplicaFetcherThread.scala +++ b/core/src/main/scala/kafka/server/ReplicaFetcherThread.scala @@ -18,7 +18,7 @@ package kafka.server import java.util - +import java.io.IOException import kafka.admin.AdminUtils import kafka.api.{FetchRequest => _, _} import kafka.cluster.{BrokerEndPoint, Replica} @@ -27,7 +27,6 @@ import kafka.log.LogConfig import kafka.server.ReplicaFetcherThread._ import kafka.server.epoch.LeaderEpochCache import org.apache.kafka.common.requests.EpochEndOffset._ -import kafka.utils.Exit import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.internals.FatalExitError import org.apache.kafka.common.metrics.Metrics @@ -83,10 +82,10 @@ class ReplicaFetcherThread(name: String, // process fetched data def processPartitionData(topicPartition: TopicPartition, fetchOffset: Long, partitionData: PartitionData) { - try { - val replica = replicaMgr.getReplica(topicPartition).get - val records = partitionData.toRecords + val replica = replicaMgr.getReplica(topicPartition).get + val records = partitionData.toRecords + try { maybeWarnIfOversizedRecords(records, topicPartition) if (fetchOffset != replica.logEndOffset.messageOffset) @@ -114,9 +113,9 @@ class ReplicaFetcherThread(name: String, quota.record(records.sizeInBytes) 
replicaMgr.brokerTopicStats.updateReplicationBytesIn(records.sizeInBytes) } catch { - case e: KafkaStorageException => - fatal(s"Disk error while replicating data for $topicPartition", e) - Exit.halt(1) + case e@ (_: KafkaStorageException | _: IOException) => + error(s"Disk error while replicating data for $topicPartition", e) + throw new KafkaStorageException(s"Disk error while replicating data for $topicPartition", e) } } @@ -199,8 +198,12 @@ class ReplicaFetcherThread(name: String, } // any logic for partitions whose leader has changed - def handlePartitionsWithErrors(partitions: Iterable[TopicPartition]) { - delayPartitions(partitions, brokerConfig.replicaFetchBackoffMs.toLong) + def handlePartitionsWithErrors(partitions: Map[TopicPartition, Option[Exception]]) { + val (partitionsWithStorageException, partitionsWithoutStorageException) = partitions.partition{ case (_, exception) => + exception.isDefined && exception.get.isInstanceOf[KafkaStorageException] + } + partitionsWithStorageException.keys.foreach(tp => replicaMgr.getLogDir(tp).foreach(replicaMgr.maybeAddLogFailureEvent)) + delayPartitions(partitionsWithoutStorageException.keys, brokerConfig.replicaFetchBackoffMs.toLong) } protected def fetch(fetchRequest: FetchRequest): Seq[(TopicPartition, PartitionData)] = { diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index 853b7c48a4928..fb1da088a9d8e 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -135,6 +135,7 @@ class ReplicaManager(val config: KafkaConfig, quotaManager: ReplicationQuotaManager, val brokerTopicStats: BrokerTopicStats, val metadataCache: MetadataCache, + logDirFailureChannel: LogDirFailureChannel, val delayedProducePurgatory: DelayedOperationPurgatory[DelayedProduce], val delayedFetchPurgatory: DelayedOperationPurgatory[DelayedFetch], val delayedDeleteRecordsPurgatory: 
DelayedOperationPurgatory[DelayedDeleteRecords], @@ -150,9 +151,10 @@ class ReplicaManager(val config: KafkaConfig, quotaManager: ReplicationQuotaManager, brokerTopicStats: BrokerTopicStats, metadataCache: MetadataCache, + logDirFailureChannel: LogDirFailureChannel, threadNamePrefix: Option[String] = None) { this(config, metrics, time, zkUtils, scheduler, logManager, isShuttingDown, - quotaManager, brokerTopicStats, metadataCache, + quotaManager, brokerTopicStats, metadataCache, logDirFailureChannel, DelayedOperationPurgatory[DelayedProduce]( purgatoryName = "Produce", brokerId = config.brokerId, purgeInterval = config.producerPurgatoryPurgeIntervalRequests), @@ -173,13 +175,24 @@ class ReplicaManager(val config: KafkaConfig, private val replicaStateChangeLock = new Object val replicaFetcherManager = createReplicaFetcherManager(metrics, time, threadNamePrefix, quotaManager) private val highWatermarkCheckPointThreadStarted = new AtomicBoolean(false) - val highWatermarkCheckpoints = config.logDirs.map(dir => (new File(dir).getAbsolutePath, new OffsetCheckpointFile(new File(dir, ReplicaManager.HighWatermarkFilename)))).toMap + @volatile var highWatermarkCheckpoints = logManager.liveLogDirs.map(dir => + (dir.getAbsolutePath, new OffsetCheckpointFile(new File(dir.getAbsolutePath, ReplicaManager.HighWatermarkFilename)))).toMap + private var hwThreadInitialized = false this.logIdent = "[Replica Manager on Broker " + localBrokerId + "]: " val stateChangeLogger = KafkaController.stateChangeLogger private val isrChangeSet: mutable.Set[TopicPartition] = new mutable.HashSet[TopicPartition]() private val lastIsrChangeMs = new AtomicLong(System.currentTimeMillis()) private val lastIsrPropagationMs = new AtomicLong(System.currentTimeMillis()) + private var logDirFailureHandler: LogDirFailureHandler = null + + private class LogDirFailureHandler(name: String) extends ShutdownableThread(name) { + override def doWork() { + val newOfflineLogDir = 
logDirFailureChannel.takeNextLogFailureEvent(300) + if (newOfflineLogDir != null) + handleLogDirFailure(newOfflineLogDir) + } + } val leaderCount = newGauge( "LeaderCount", @@ -277,6 +290,8 @@ class ReplicaManager(val config: KafkaConfig, // A follower can lag behind leader for up to config.replicaLagTimeMaxMs x 1.5 before it is removed from ISR scheduler.schedule("isr-expiration", maybeShrinkIsr _, period = config.replicaLagTimeMaxMs / 2, unit = TimeUnit.MILLISECONDS) scheduler.schedule("isr-change-propagation", maybePropagateIsrChanges _, period = 2500L, unit = TimeUnit.MILLISECONDS) + logDirFailureHandler = new LogDirFailureHandler("LogDirFailureHandler") + logDirFailureHandler.start() } def stopReplica(topicPartition: TopicPartition, deletePartition: Boolean): Errors = { @@ -287,18 +302,21 @@ class ReplicaManager(val config: KafkaConfig, if (deletePartition) { val removedPartition = allPartitions.remove(topicPartition) if (removedPartition != null) { - removedPartition.delete() // this will delete the local log val topicHasPartitions = allPartitions.keys.exists(tp => topicPartition.topic == tp.topic) if (!topicHasPartitions) brokerTopicStats.removeMetrics(topicPartition.topic) + // this will delete the local log. This call may throw exception if the log is on offline directory + removedPartition.delete() + } else { + logManager.asyncDelete(topicPartition) } } case None => // Delete log and corresponding folders in case replica manager doesn't hold them anymore. // This could happen when topic is being deleted while broker is down and recovers. 
- if (deletePartition && logManager.getLog(topicPartition).isDefined) + if (deletePartition) logManager.asyncDelete(topicPartition) - stateChangeLogger.trace(s"Broker $localBrokerId ignoring stop replica (delete=$deletePartition) for partition $topicPartition as replica doesn't exist on broker") + stateChangeLogger.trace(s"Broker $localBrokerId ignoring stop replica (delete=$deletePartition) for partition $topicPartition as replica either doesn't exist on broker") } stateChangeLogger.trace(s"Broker $localBrokerId finished handling stop replica (delete=$deletePartition) for partition $topicPartition") error @@ -317,8 +335,16 @@ class ReplicaManager(val config: KafkaConfig, // First stop fetchers for all partitions, then stop the corresponding replicas replicaFetcherManager.removeFetcherForPartitions(partitions) for (topicPartition <- partitions){ - val error = stopReplica(topicPartition, stopReplicaRequest.deletePartitions) - responseMap.put(topicPartition, error) + try { + val errorCode = stopReplica(topicPartition, stopReplicaRequest.deletePartitions) + responseMap.put(topicPartition, errorCode) + } catch { + case e@ (_: KafkaStorageException | _: IOException) => + stateChangeLogger.error(s"Broker $localBrokerId ignoring stop replica (delete=${stopReplicaRequest.deletePartitions}) for partition $topicPartition due to storage exception", e) + error("Error stopping replicas of partition %s".format(topicPartition), e) + getLogDir(topicPartition).foreach(logDirFailureChannel.maybeAddLogFailureEvent) + responseMap.put(topicPartition, Errors.KAFKA_STORAGE_ERROR) + } } (responseMap, Errors.NONE) } @@ -356,6 +382,13 @@ class ReplicaManager(val config: KafkaConfig, def getReplica(tp: TopicPartition): Option[Replica] = getReplica(tp, localBrokerId) + def getLogDir(topicPartition: TopicPartition): Option[String] = { + getReplica(topicPartition).flatMap(_.log) match { + case Some(log) => Some(log.dir.getParent) + case None => None + } + } + /** * Append messages to leader 
replicas of the partition, and wait for them to be replicated to other replicas; * the callback function will be triggered either when timeout or the required acks are satisfied; @@ -443,13 +476,10 @@ class ReplicaManager(val config: KafkaConfig, } catch { // NOTE: Failed produce requests metric is not incremented for known exceptions // it is supposed to indicate un-expected failures of a broker in handling a produce request - case e: KafkaStorageException => - fatal("Halting due to unrecoverable I/O error while handling DeleteRecordsRequest: ", e) - Runtime.getRuntime.halt(1) - (topicPartition, null) case e@ (_: UnknownTopicOrPartitionException | _: NotLeaderForPartitionException | _: OffsetOutOfRangeException | + _: KafkaStorageException | _: PolicyViolationException | _: NotEnoughReplicasException) => (topicPartition, LogDeleteRecordsResult(-1L, -1L, Some(e))) @@ -543,7 +573,14 @@ class ReplicaManager(val config: KafkaConfig, val partitionOpt = getPartition(topicPartition) val info = partitionOpt match { case Some(partition) => - partition.appendRecordsToLeader(records, isFromClient, requiredAcks) + try { + partition.appendRecordsToLeader(records, isFromClient, requiredAcks) + } catch { + case e: KafkaStorageException => + error("Error processing append operation on partition %s".format(topicPartition), e) + getLogDir(topicPartition).foreach(maybeAddLogFailureEvent) + throw e + } case None => throw new UnknownTopicOrPartitionException("Partition %s doesn't exist on %d" .format(topicPartition, localBrokerId)) @@ -567,15 +604,12 @@ class ReplicaManager(val config: KafkaConfig, } catch { // NOTE: Failed produce requests metric is not incremented for known exceptions // it is supposed to indicate un-expected failures of a broker in handling a produce request - case e: KafkaStorageException => - fatal("Halting due to unrecoverable I/O error while handling produce request: ", e) - Exit.halt(1) - (topicPartition, null) case e@ (_: UnknownTopicOrPartitionException | _: 
NotLeaderForPartitionException | _: RecordTooLargeException | _: RecordBatchTooLargeException | _: CorruptRecordException | + _: KafkaStorageException | _: InvalidTimestampException) => (topicPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(e))) case t: Throwable => @@ -713,19 +747,29 @@ class ReplicaManager(val config: KafkaConfig, val fetchTimeMs = time.milliseconds val logReadInfo = localReplica.log match { case Some(log) => - val adjustedFetchSize = math.min(partitionFetchSize, limitBytes) - - // Try the read first, this tells us whether we need all of adjustedFetchSize for this partition - val fetch = log.read(offset, adjustedFetchSize, maxOffsetOpt, minOneMessage, isolationLevel) - - // If the partition is being throttled, simply return an empty set. - if (shouldLeaderThrottle(quota, tp, replicaId)) - FetchDataInfo(fetch.fetchOffsetMetadata, MemoryRecords.EMPTY) - // For FetchRequest version 3, we replace incomplete message sets with an empty one as consumers can make - // progress in such cases and don't need to report a `RecordTooLargeException` - else if (!hardMaxBytesLimit && fetch.firstEntryIncomplete) - FetchDataInfo(fetch.fetchOffsetMetadata, MemoryRecords.EMPTY) - else fetch + try { + val adjustedFetchSize = math.min(partitionFetchSize, limitBytes) + + // Try the read first, this tells us whether we need all of adjustedFetchSize for this partition + val fetch = log.read(offset, adjustedFetchSize, maxOffsetOpt, minOneMessage, isolationLevel) + + // If the partition is being throttled, simply return an empty set. 
+ if (shouldLeaderThrottle(quota, tp, replicaId)) + FetchDataInfo(fetch.fetchOffsetMetadata, MemoryRecords.EMPTY) + // For FetchRequest version 3, we replace incomplete message sets with an empty one as consumers can make + // progress in such cases and don't need to report a `RecordTooLargeException` + else if (!hardMaxBytesLimit && fetch.firstEntryIncomplete) + FetchDataInfo(fetch.fetchOffsetMetadata, MemoryRecords.EMPTY) + else fetch + } catch { + case e@ (_: IOException | _: KafkaStorageException) => + error(s"Error processing fetch operation on partition ${tp}, offset $offset", e) + getLogDir(tp).foreach(maybeAddLogFailureEvent) + e match { + case storageException: KafkaStorageException => throw storageException + case _ => throw new KafkaStorageException(s"Error processing fetch operation on partition ${tp}, offset $offset", e) + } + } case None => error(s"Leader for partition $tp does not have a local log") @@ -747,6 +791,7 @@ class ReplicaManager(val config: KafkaConfig, case e@ (_: UnknownTopicOrPartitionException | _: NotLeaderForPartitionException | _: ReplicaNotAvailableException | + _: KafkaStorageException | _: OffsetOutOfRangeException) => LogReadResult(info = FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MemoryRecords.EMPTY), highWatermark = -1L, @@ -819,7 +864,8 @@ class ReplicaManager(val config: KafkaConfig, } } - def becomeLeaderOrFollower(correlationId: Int,leaderAndISRRequest: LeaderAndIsrRequest, + def becomeLeaderOrFollower(correlationId: Int, + leaderAndISRRequest: LeaderAndIsrRequest, onLeadershipChange: (Iterable[Partition], Iterable[Partition]) => Unit): BecomeLeaderOrFollowerResult = { leaderAndISRRequest.partitionStates.asScala.foreach { case (topicPartition, stateInfo) => stateChangeLogger.trace("Broker %d received LeaderAndIsr request %s correlation id %d from controller %d epoch %d for partition [%s,%d]" @@ -878,6 +924,12 @@ class ReplicaManager(val config: KafkaConfig, else Set.empty[Partition] + 
leaderAndISRRequest.partitionStates.asScala.keys.foreach( topicPartition => + // Remove the partition from cache if local replica can not be created due to KafkaStorageException + if (getReplica(topicPartition).isEmpty) + allPartitions.remove(topicPartition) + ) + // we initialize highwatermark thread after the first leaderisrrequest. This ensures that all the partitions // have been completely populated before starting the checkpointing there by avoiding weird race conditions if (!hwThreadInitialized) { @@ -885,8 +937,8 @@ class ReplicaManager(val config: KafkaConfig, hwThreadInitialized = true } replicaFetcherManager.shutdownIdleFetcherThreads() - onLeadershipChange(partitionsBecomeLeader, partitionsBecomeFollower) + BecomeLeaderOrFollowerResult(responseMap, Errors.NONE) } } @@ -926,18 +978,26 @@ class ReplicaManager(val config: KafkaConfig, replicaFetcherManager.removeFetcherForPartitions(partitionState.keySet.map(_.topicPartition)) // Update the partition information to be the leader partitionState.foreach{ case (partition, partitionStateInfo) => - if (partition.makeLeader(controllerId, partitionStateInfo, correlationId)) - partitionsToMakeLeaders += partition - else - stateChangeLogger.info(("Broker %d skipped the become-leader state change after marking its partition as leader with correlation id %d from " + - "controller %d epoch %d for partition %s since it is already the leader for the partition.") - .format(localBrokerId, correlationId, controllerId, epoch, partition.topicPartition)) - } - partitionsToMakeLeaders.foreach { partition => - stateChangeLogger.trace(("Broker %d stopped fetchers as part of become-leader request from controller " + - "%d epoch %d with correlation id %d for partition %s") - .format(localBrokerId, controllerId, epoch, correlationId, partition.topicPartition)) + try { + if (partition.makeLeader(controllerId, partitionStateInfo, correlationId)) { + partitionsToMakeLeaders += partition + stateChangeLogger.trace(("Broker %d stopped 
fetchers as part of become-leader request from controller " + + "%d epoch %d with correlation id %d for partition %s") + .format(localBrokerId, controllerId, epoch, correlationId, partition.topicPartition)) + } else + stateChangeLogger.info(("Broker %d skipped the become-leader state change after marking its partition as leader with correlation id %d from " + + "controller %d epoch %d for partition %s since it is already the leader for the partition.") + .format(localBrokerId, correlationId, controllerId, epoch, partition.topicPartition)) + } catch { + case e@ (_: KafkaStorageException | _: IOException) => + stateChangeLogger.error(("Broker %d skipped the become-leader state change with correlation id %d from " + + "controller %d epoch %d for partition %s since the replica for the partition is offline due to disk error %s.") + .format(localBrokerId, correlationId, controllerId, epoch, partition.topicPartition, e)) + getLogDir(new TopicPartition(partition.topic, partition.partitionId)).foreach(maybeAddLogFailureEvent) + responseMap.put(new TopicPartition(partition.topic, partition.partitionId), Errors.KAFKA_STORAGE_ERROR) + } } + } catch { case e: Throwable => partitionState.keys.foreach { partition => @@ -996,27 +1056,37 @@ class ReplicaManager(val config: KafkaConfig, // TODO: Delete leaders from LeaderAndIsrRequest partitionState.foreach{ case (partition, partitionStateInfo) => - val newLeaderBrokerId = partitionStateInfo.leader - metadataCache.getAliveBrokers.find(_.id == newLeaderBrokerId) match { - // Only change partition state when the leader is available - case Some(_) => - if (partition.makeFollower(controllerId, partitionStateInfo, correlationId)) - partitionsToMakeFollower += partition - else - stateChangeLogger.info(("Broker %d skipped the become-follower state change after marking its partition as follower with correlation id %d from " + - "controller %d epoch %d for partition %s since the new leader %d is the same as the old leader") + try { + val 
newLeaderBrokerId = partitionStateInfo.leader + metadataCache.getAliveBrokers.find(_.id == newLeaderBrokerId) match { + // Only change partition state when the leader is available + case Some(_) => + if (partition.makeFollower(controllerId, partitionStateInfo, correlationId)) + partitionsToMakeFollower += partition + else + stateChangeLogger.info(("Broker %d skipped the become-follower state change after marking its partition as follower with correlation id %d from " + + "controller %d epoch %d for partition %s since the new leader %d is the same as the old leader") + .format(localBrokerId, correlationId, controllerId, partitionStateInfo.controllerEpoch, + partition.topicPartition, newLeaderBrokerId)) + case None => + // The leader broker should always be present in the metadata cache. + // If not, we should record the error message and abort the transition process for this partition + stateChangeLogger.error(("Broker %d received LeaderAndIsrRequest with correlation id %d from controller" + + " %d epoch %d for partition %s but cannot become follower since the new leader %d is unavailable.") .format(localBrokerId, correlationId, controllerId, partitionStateInfo.controllerEpoch, - partition.topicPartition, newLeaderBrokerId)) - case None => - // The leader broker should always be present in the metadata cache. - // If not, we should record the error message and abort the transition process for this partition - stateChangeLogger.error(("Broker %d received LeaderAndIsrRequest with correlation id %d from controller" + - " %d epoch %d for partition %s but cannot become follower since the new leader %d is unavailable.") + partition.topicPartition, newLeaderBrokerId)) + // Create the local replica even if the leader is unavailable. 
This is required to ensure that we include + // the partition's high watermark in the checkpoint file (see KAFKA-1647) + partition.getOrCreateReplica(isNew = partitionStateInfo.isNew) + } + } catch { + case e@ (_: KafkaStorageException | _: IOException) => + stateChangeLogger.error(("Broker %d skipped the become-follower state change with correlation id %d from " + + "controller %d epoch %d for partition [%s,%d] since the replica for the partition is offline due to disk error %s") .format(localBrokerId, correlationId, controllerId, partitionStateInfo.controllerEpoch, - partition.topicPartition, newLeaderBrokerId)) - // Create the local replica even if the leader is unavailable. This is required to ensure that we include - // the partition's high watermark in the checkpoint file (see KAFKA-1647) - partition.getOrCreateReplica() + partition.topic, partition.partitionId, e)) + getLogDir(new TopicPartition(partition.topic, partition.partitionId)).foreach(maybeAddLogFailureEvent) + responseMap.put(new TopicPartition(partition.topic, partition.partitionId), Errors.KAFKA_STORAGE_ERROR) } } @@ -1115,18 +1185,61 @@ class ReplicaManager(val config: KafkaConfig, for ((dir, reps) <- replicasByDir) { val hwms = reps.map(r => r.partition.topicPartition -> r.highWatermark.messageOffset).toMap try { - highWatermarkCheckpoints(dir).write(hwms) + highWatermarkCheckpoints.get(dir).foreach(_.write(hwms)) } catch { case e: IOException => - fatal("Error writing to highwatermark file: ", e) - Exit.halt(1) + error("Error writing to highwatermark file: ", e) + maybeAddLogFailureEvent(dir) } } } + def maybeAddLogFailureEvent(logDir: String): Unit = { + logDirFailureChannel.maybeAddLogFailureEvent(logDir) + } + + def handleLogDirFailure(dir: String) { + if (!logManager.isLogDirOnline(dir)) + return + + info(s"Stopping serving replicas in dir $dir") + replicaStateChangeLock synchronized { + val newOfflinePartitions = allPartitions.values.filter { partition => + 
partition.getReplica(config.brokerId) match { + case Some(replica) => + replica.log.isDefined && replica.log.get.dir.getParentFile.getAbsolutePath == dir + case None => false + } + }.map(_.topicPartition) + + info(s"Partitions ${newOfflinePartitions.mkString(",")} are offline due to failure on log directory $dir") + + newOfflinePartitions.foreach { topicPartition => + val partition = allPartitions.remove(topicPartition) + partition.removePartitionMetrics() + } + + newOfflinePartitions.map(_.topic).toSet.foreach { topic: String => + val topicHasPartitions = allPartitions.keys.exists(tp => topic == tp.topic) + if (!topicHasPartitions) + brokerTopicStats.removeMetrics(topic) + } + + replicaFetcherManager.removeFetcherForPartitions(newOfflinePartitions.toSet) + highWatermarkCheckpoints = highWatermarkCheckpoints.filterKeys(_ != dir) + info("Broker %d stopped fetcher for partitions %s because they are in the failed log dir %s" + .format(localBrokerId, newOfflinePartitions.mkString(", "), dir)) + } + logManager.handleLogDirFailure(dir) + LogDirUtils.propagateLogDirEvent(zkUtils, localBrokerId) + info(s"Stopped serving replicas in dir $dir") + } + // High watermark do not need to be checkpointed only when under unit tests def shutdown(checkpointHW: Boolean = true) { info("Shutting down") + if (logDirFailureHandler != null) + logDirFailureHandler.shutdown() replicaFetcherManager.shutdown() delayedFetchPurgatory.shutdown() delayedProducePurgatory.shutdown() diff --git a/core/src/main/scala/kafka/server/checkpoints/CheckpointFile.scala b/core/src/main/scala/kafka/server/checkpoints/CheckpointFile.scala index cc50620639478..da49012515ac3 100644 --- a/core/src/main/scala/kafka/server/checkpoints/CheckpointFile.scala +++ b/core/src/main/scala/kafka/server/checkpoints/CheckpointFile.scala @@ -33,7 +33,7 @@ class CheckpointFile[T](val file: File, version: Int, formatter: CheckpointFileF private val path = file.toPath.toAbsolutePath private val tempPath = Paths.get(path.toString + 
".tmp") private val lock = new Object() - + try Files.createFile(file.toPath) // create the file if it doesn't exist catch { case _: FileAlreadyExistsException => } diff --git a/core/src/main/scala/kafka/utils/LogDirUtils.scala b/core/src/main/scala/kafka/utils/LogDirUtils.scala new file mode 100644 index 0000000000000..05bff96cf1b0d --- /dev/null +++ b/core/src/main/scala/kafka/utils/LogDirUtils.scala @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package kafka.utils + +import kafka.controller.{LogDirEventNotificationListener} +import scala.collection.{Map, Seq, Set} + +object LogDirUtils extends Logging { + + private val LogDirEventNotificationPrefix = "log_dir_event_" + val LogDirFailureEvent = 1 + + def propagateLogDirEvent(zkUtils: ZkUtils, brokerId: Int) { + val logDirEventNotificationPath: String = zkUtils.createSequentialPersistentPath( + ZkUtils.LogDirEventNotificationPath + "/" + LogDirEventNotificationPrefix, logDirFailureEventZkData(brokerId)) + debug("Added " + logDirEventNotificationPath + " for broker " + brokerId) + } + + private def logDirFailureEventZkData(brokerId: Int): String = { + Json.encode(Map("version" -> LogDirEventNotificationListener.version, "broker" -> brokerId, "event" -> LogDirFailureEvent)) + } + + def deleteLogDirEvents(zkUtils: ZkUtils) { + val sequenceNumbers = zkUtils.getChildrenParentMayNotExist(ZkUtils.LogDirEventNotificationPath).toSet + sequenceNumbers.map(x => zkUtils.deletePath(ZkUtils.LogDirEventNotificationPath + "/" + x)) + } + + def getBrokerIdFromLogDirEvent(zkUtils: ZkUtils, child: String): Option[Int] = { + val changeZnode = ZkUtils.LogDirEventNotificationPath + "/" + child + val (jsonOpt, stat) = zkUtils.readDataMaybeNull(changeZnode) + if (jsonOpt.isDefined) { + val json = Json.parseFull(jsonOpt.get) + + json match { + case Some(m) => + val brokerAndTopics = m.asInstanceOf[Map[String, Any]] + val brokerId = brokerAndTopics.get("broker").get.asInstanceOf[Int] + Some(brokerId) + case None => + error("Invalid topic and partition JSON: " + jsonOpt.get + " in ZK: " + changeZnode) + None + } + } else { + None + } + } + +} diff --git a/core/src/main/scala/kafka/utils/ZkUtils.scala b/core/src/main/scala/kafka/utils/ZkUtils.scala index 0035120e74db6..7d3529fc18bc6 100644 --- a/core/src/main/scala/kafka/utils/ZkUtils.scala +++ b/core/src/main/scala/kafka/utils/ZkUtils.scala @@ -51,6 +51,7 @@ object ZkUtils { val ControllerPath = "/controller" val 
ControllerEpochPath = "/controller_epoch" val IsrChangeNotificationPath = "/isr_change_notification" + val LogDirEventNotificationPath = "/log_dir_event_notification" val KafkaAclPath = "/kafka-acl" val KafkaAclChangesPath = "/kafka-acl-changes" @@ -75,7 +76,8 @@ object ZkUtils { IsrChangeNotificationPath, KafkaAclPath, KafkaAclChangesPath, - ProducerIdBlockPath) + ProducerIdBlockPath, + LogDirEventNotificationPath) // Important: it is necessary to add any new top level Zookeeper path that contains // sensitive information that should not be world readable to the Seq @@ -235,7 +237,8 @@ class ZkUtils(val zkClient: ZkClient, DeleteTopicsPath, BrokerSequenceIdPath, IsrChangeNotificationPath, - ProducerIdBlockPath) + ProducerIdBlockPath, + LogDirEventNotificationPath) // Visible for testing val zkPath = new ZkPath(zkClient) diff --git a/core/src/test/scala/integration/kafka/api/AuthorizerIntegrationTest.scala b/core/src/test/scala/integration/kafka/api/AuthorizerIntegrationTest.scala index 09ff9be80d816..2b134fe1fa30b 100644 --- a/core/src/test/scala/integration/kafka/api/AuthorizerIntegrationTest.scala +++ b/core/src/test/scala/integration/kafka/api/AuthorizerIntegrationTest.scala @@ -44,7 +44,6 @@ import org.apache.kafka.common.security.auth.KafkaPrincipal import org.apache.kafka.common.{Node, TopicPartition, requests} import org.junit.Assert._ import org.junit.{After, Assert, Before, Test} - import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.Buffer @@ -272,7 +271,8 @@ class AuthorizerIntegrationTest extends BaseRequestTest { } private def createUpdateMetadataRequest = { - val partitionState = Map(tp -> new PartitionState(Int.MaxValue, brokerId, Int.MaxValue, List(brokerId).asJava, 2, Seq(brokerId).asJava)).asJava + val partitionState = Map(tp -> new UpdateMetadataRequest.PartitionState( + Int.MaxValue, brokerId, Int.MaxValue, List(brokerId).asJava, 2, Seq(brokerId).asJava, Seq.empty[Integer].asJava)).asJava val 
securityProtocol = SecurityProtocol.PLAINTEXT val brokers = Set(new requests.UpdateMetadataRequest.Broker(brokerId, Seq(new requests.UpdateMetadataRequest.EndPoint("localhost", 0, securityProtocol, @@ -303,8 +303,8 @@ class AuthorizerIntegrationTest extends BaseRequestTest { private def leaveGroupRequest = new LeaveGroupRequest.Builder(group, "").build() private def leaderAndIsrRequest = { - new requests.LeaderAndIsrRequest.Builder(brokerId, Int.MaxValue, - Map(tp -> new PartitionState(Int.MaxValue, brokerId, Int.MaxValue, List(brokerId).asJava, 2, Seq(brokerId).asJava)).asJava, + new requests.LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, brokerId, Int.MaxValue, + Map(tp -> new PartitionState(Int.MaxValue, brokerId, Int.MaxValue, List(brokerId).asJava, 2, Seq(brokerId).asJava, false)).asJava, Set(new Node(brokerId, "localhost", 0)).asJava).build() } diff --git a/core/src/test/scala/integration/kafka/api/IntegrationTestHarness.scala b/core/src/test/scala/integration/kafka/api/IntegrationTestHarness.scala index 921c2b4bef14c..5e3c7abca3955 100644 --- a/core/src/test/scala/integration/kafka/api/IntegrationTestHarness.scala +++ b/core/src/test/scala/integration/kafka/api/IntegrationTestHarness.scala @@ -37,6 +37,7 @@ abstract class IntegrationTestHarness extends KafkaServerTestHarness { val producerCount: Int val consumerCount: Int val serverCount: Int + var logDirCount: Int = 1 lazy val producerConfig = new Properties lazy val consumerConfig = new Properties lazy val serverConfig = new Properties @@ -46,7 +47,7 @@ abstract class IntegrationTestHarness extends KafkaServerTestHarness { override def generateConfigs = { val cfgs = TestUtils.createBrokerConfigs(serverCount, zkConnect, interBrokerSecurityProtocol = Some(securityProtocol), - trustStoreFile = trustStoreFile, saslProperties = serverSaslProperties) + trustStoreFile = trustStoreFile, saslProperties = serverSaslProperties, logDirCount = logDirCount) cfgs.foreach { config => 
config.setProperty(KafkaConfig.ListenersProp, s"${listenerName.value}://localhost:${TestUtils.RandomPort}") config.remove(KafkaConfig.InterBrokerSecurityProtocolProp) @@ -84,7 +85,7 @@ abstract class IntegrationTestHarness extends KafkaServerTestHarness { saslProperties = this.clientSaslProperties, props = Some(producerConfig)) } - + def createNewConsumer: KafkaConsumer[Array[Byte], Array[Byte]] = { TestUtils.createNewConsumer(brokerList, securityProtocol = this.securityProtocol, diff --git a/core/src/test/scala/integration/kafka/api/LogDirFailureTest.scala b/core/src/test/scala/integration/kafka/api/LogDirFailureTest.scala new file mode 100644 index 0000000000000..b422658d814ce --- /dev/null +++ b/core/src/test/scala/integration/kafka/api/LogDirFailureTest.scala @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package kafka.api + +import java.util.Collections +import java.util.concurrent.{ExecutionException, TimeUnit} +import org.apache.kafka.common.errors.NotLeaderForPartitionException +import kafka.server.KafkaConfig +import kafka.utils.{CoreUtils, TestUtils} +import org.apache.kafka.clients.consumer.KafkaConsumer +import org.apache.kafka.clients.producer.{ProducerConfig, ProducerRecord} +import org.apache.kafka.common.TopicPartition +import org.apache.kafka.common.errors.UnknownTopicOrPartitionException +import org.apache.kafka.common.utils.Utils +import org.junit.{Before, Test} +import org.junit.Assert.assertTrue + +/** + * Test whether clients can produce and consume when there is a log directory failure + */ +class LogDirFailureTest extends IntegrationTestHarness { + val producerCount: Int = 1 + val consumerCount: Int = 1 + val serverCount: Int = 2 + private val topic = "topic" + + this.logDirCount = 2 + this.producerConfig.setProperty(ProducerConfig.RETRIES_CONFIG, "0") + this.producerConfig.setProperty(ProducerConfig.METADATA_MAX_AGE_CONFIG, "100") + this.serverConfig.setProperty(KafkaConfig.ReplicaHighWatermarkCheckpointIntervalMsProp, "100") + + @Before + override def setUp() { + super.setUp() + TestUtils.createTopic(zkUtils, topic, 1, 2, servers = servers) + } + + @Test + def testProduceAfterLogDirFailure() { + + val consumer = consumers.head + subscribeAndWaitForAssignment(topic, consumer) + val producer = producers.head + val partition = new TopicPartition(topic, 0) + val record = new ProducerRecord(topic, 0, s"key".getBytes, s"value".getBytes) + + val leaderServerId = producer.partitionsFor(topic).get(0).leader().id() + val leaderServer = servers.find(_.config.brokerId == leaderServerId).get + + // The first send() should succeed + producer.send(record).get() + TestUtils.waitUntilTrue(() => { + consumer.poll(0).count() == 1 + }, "Expected the first message", 3000L) + + // Make log directory of the partition on the leader broker inaccessible by 
replacing it with a file + val replica = leaderServer.replicaManager.getReplica(partition) + val logDir = replica.get.log.get.dir.getParentFile + CoreUtils.swallow(Utils.delete(logDir)) + logDir.createNewFile() + assertTrue(logDir.isFile) + + // Wait for ReplicaHighWatermarkCheckpoint to happen so that the log directory of the topic will be offline + TestUtils.waitUntilTrue(() => !leaderServer.logManager.liveLogDirs.contains(logDir), "Expected log directory offline", 3000L) + assertTrue(leaderServer.replicaManager.getReplica(partition).isEmpty) + + // The second send() should fail with either UnknownTopicOrPartitionException or NotLeaderForPartitionException + try { + producer.send(record).get(6000, TimeUnit.MILLISECONDS) + fail("send() should fail with either UnknownTopicOrPartitionException or NotLeaderForPartitionException") + } catch { + case e: ExecutionException => + e.getCause match { + case t: UnknownTopicOrPartitionException => + case t: NotLeaderForPartitionException => // This may happen if the broker receives a LeaderAndIsrRequest between the first two ProduceRequests + case t: Throwable => + fail(s"send() should fail with either UnknownTopicOrPartitionException or NotLeaderForPartitionException instead of ${t.toString}") + } + case e: Throwable => fail(s"send() should fail with UnknownTopicOrPartitionException instead of ${e.toString}") + } + + // Wait for producer to update metadata for the partition + TestUtils.waitUntilTrue(() => { + producer.partitionsFor(topic).get(0).leader().id() != leaderServerId + }, "Expected new leader for the partition", 10000L) + + // The third send() should succeed + producer.send(record).get() + TestUtils.waitUntilTrue(() => { + consumer.poll(0).count() == 1 + }, "Expected the second message", 3000L) + } + + private def subscribeAndWaitForAssignment(topic: String, consumer: KafkaConsumer[Array[Byte], Array[Byte]]) { + consumer.subscribe(Collections.singletonList(topic)) + TestUtils.waitUntilTrue(() => { + consumer.poll(0) + !consumer.assignment.isEmpty + }, 
"Expected non-empty assignment") + } + +} diff --git a/core/src/test/scala/integration/kafka/api/TransactionsTest.scala b/core/src/test/scala/integration/kafka/api/TransactionsTest.scala index 0e57e53bc4dd1..760cc39a974fe 100644 --- a/core/src/test/scala/integration/kafka/api/TransactionsTest.scala +++ b/core/src/test/scala/integration/kafka/api/TransactionsTest.scala @@ -389,7 +389,7 @@ class TransactionsTest extends KafkaServerTestHarness { val recordMetadata = result.get() error(s"Missed a producer fenced exception when writing to ${recordMetadata.topic}-${recordMetadata.partition}. Grab the logs!!") servers.foreach { server => - error(s"log dirs: ${server.logManager.logDirs.map(_.getAbsolutePath).head}") + error(s"log dirs: ${server.logManager.liveLogDirs.map(_.getAbsolutePath).head}") } fail("Should not be able to send messages from a fenced producer.") } catch { @@ -436,7 +436,7 @@ class TransactionsTest extends KafkaServerTestHarness { val recordMetadata = result.get() error(s"Missed a producer fenced exception when writing to ${recordMetadata.topic}-${recordMetadata.partition}. 
Grab the logs!!") servers.foreach { case (server) => - error(s"log dirs: ${server.logManager.logDirs.map(_.getAbsolutePath).head}") + error(s"log dirs: ${server.logManager.liveLogDirs.map(_.getAbsolutePath).head}") } fail("Should not be able to send messages from a fenced producer.") } catch { diff --git a/core/src/test/scala/integration/kafka/server/ReplicaFetcherThreadFatalErrorTest.scala b/core/src/test/scala/integration/kafka/server/ReplicaFetcherThreadFatalErrorTest.scala index 147e84ac66e4e..ebe72234e03fb 100644 --- a/core/src/test/scala/integration/kafka/server/ReplicaFetcherThreadFatalErrorTest.scala +++ b/core/src/test/scala/integration/kafka/server/ReplicaFetcherThreadFatalErrorTest.scala @@ -111,7 +111,7 @@ class ReplicaFetcherThreadFatalErrorTest extends ZooKeeperTestHarness { override def createReplicaManager(isShuttingDown: AtomicBoolean): ReplicaManager = { new ReplicaManager(config, metrics, time, zkUtils, kafkaScheduler, logManager, isShuttingDown, - quotaManagers.follower, new BrokerTopicStats, metadataCache) { + quotaManagers.follower, new BrokerTopicStats, metadataCache, logDirFailureChannel) { override protected def createReplicaFetcherManager(metrics: Metrics, time: Time, threadNamePrefix: Option[String], quotaManager: ReplicationQuotaManager) = diff --git a/core/src/test/scala/unit/kafka/log/AbstractLogCleanerIntegrationTest.scala b/core/src/test/scala/unit/kafka/log/AbstractLogCleanerIntegrationTest.scala index bf361992c59cf..f8c81e58c8c2a 100644 --- a/core/src/test/scala/unit/kafka/log/AbstractLogCleanerIntegrationTest.scala +++ b/core/src/test/scala/unit/kafka/log/AbstractLogCleanerIntegrationTest.scala @@ -20,13 +20,13 @@ import java.io.File import java.nio.file.Files import java.util.Properties -import kafka.server.BrokerTopicStats +import kafka.server.{BrokerTopicStats, LogDirFailureChannel} import kafka.utils.{MockTime, Pool, TestUtils} import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.utils.Utils import 
org.junit.After -import scala.collection.mutable.ListBuffer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} abstract class AbstractLogCleanerIntegrationTest { @@ -110,6 +110,7 @@ abstract class AbstractLogCleanerIntegrationTest { new LogCleaner(cleanerConfig, logDirs = Array(logDir), logs = logMap, + logDirFailureChannel = new LogDirFailureChannel(1), time = time) } } diff --git a/core/src/test/scala/unit/kafka/log/LogCleanerIntegrationTest.scala b/core/src/test/scala/unit/kafka/log/LogCleanerIntegrationTest.scala index 8a119c2f51aa5..e569b293a8379 100755 --- a/core/src/test/scala/unit/kafka/log/LogCleanerIntegrationTest.scala +++ b/core/src/test/scala/unit/kafka/log/LogCleanerIntegrationTest.scala @@ -5,7 +5,7 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/core/src/test/scala/unit/kafka/log/LogCleanerManagerTest.scala b/core/src/test/scala/unit/kafka/log/LogCleanerManagerTest.scala index b4c1790f2137f..1d1028d8c1ddc 100644 --- a/core/src/test/scala/unit/kafka/log/LogCleanerManagerTest.scala +++ b/core/src/test/scala/unit/kafka/log/LogCleanerManagerTest.scala @@ -29,6 +29,8 @@ import org.junit.Assert._ import org.junit.{After, Test} import org.scalatest.junit.JUnitSuite +import scala.collection.mutable.ArrayBuffer + /** * Unit tests for the log cleaning logic */ @@ -218,7 +220,7 @@ class LogCleanerManagerTest extends JUnitSuite with Logging { private def createCleanerManager(log: Log): LogCleanerManager = { val logs = new Pool[TopicPartition, Log]() logs.put(new TopicPartition("log", 0), log) - val cleanerManager = new LogCleanerManager(Array(logDir), logs) + val cleanerManager = new LogCleanerManager(Array(logDir), logs, null) cleanerManager } diff --git 
a/core/src/test/scala/unit/kafka/log/LogManagerTest.scala b/core/src/test/scala/unit/kafka/log/LogManagerTest.scala index 8b7819f030ba7..7baa312c8437e 100755 --- a/core/src/test/scala/unit/kafka/log/LogManagerTest.scala +++ b/core/src/test/scala/unit/kafka/log/LogManagerTest.scala @@ -24,6 +24,7 @@ import kafka.common._ import kafka.server.FetchDataInfo import kafka.server.checkpoints.OffsetCheckpointFile import kafka.utils._ +import kafka.zk.ZooKeeperTestHarness import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.errors.OffsetOutOfRangeException import org.apache.kafka.common.requests.IsolationLevel @@ -31,7 +32,7 @@ import org.apache.kafka.common.utils.Utils import org.junit.Assert._ import org.junit.{After, Before, Test} -class LogManagerTest { +class LogManagerTest extends ZooKeeperTestHarness { val time: MockTime = new MockTime() val maxRollInterval = 100 @@ -48,19 +49,21 @@ class LogManagerTest { val veryLargeLogFlushInterval = 10000000L @Before - def setUp() { + override def setUp() { + super.setUp() logDir = TestUtils.tempDir() logManager = createLogManager() logManager.startup() - logDir = logManager.logDirs(0) + logDir = logManager.liveLogDirs(0) } @After - def tearDown() { + override def tearDown() { + super.tearDown() if(logManager != null) logManager.shutdown() Utils.delete(logDir) - logManager.logDirs.foreach(Utils.delete) + logManager.liveLogDirs.foreach(Utils.delete) } /** @@ -68,7 +71,7 @@ class LogManagerTest { */ @Test def testCreateLog() { - val log = logManager.createLog(new TopicPartition(name, 0), logConfig) + val log = logManager.getOrCreateLog(new TopicPartition(name, 0), logConfig) val logFile = new File(logDir, name + "-0") assertTrue(logFile.exists) log.appendAsLeader(TestUtils.singletonRecords("test".getBytes()), leaderEpoch = 0) @@ -90,7 +93,7 @@ class LogManagerTest { */ @Test def testCleanupExpiredSegments() { - val log = logManager.createLog(new TopicPartition(name, 0), logConfig) + val log = 
logManager.getOrCreateLog(new TopicPartition(name, 0), logConfig) var offset = 0L for(_ <- 0 until 200) { val set = TestUtils.singletonRecords("test".getBytes()) @@ -135,7 +138,7 @@ class LogManagerTest { logManager.startup() // create a log - val log = logManager.createLog(new TopicPartition(name, 0), config) + val log = logManager.getOrCreateLog(new TopicPartition(name, 0), config) var offset = 0L // add a bunch of messages that should be larger than the retentionSize @@ -175,7 +178,7 @@ class LogManagerTest { def testDoesntCleanLogsWithCompactDeletePolicy() { val logProps = new Properties() logProps.put(LogConfig.CleanupPolicyProp, LogConfig.Compact + "," + LogConfig.Delete) - val log = logManager.createLog(new TopicPartition(name, 0), LogConfig.fromProps(logConfig.originals, logProps)) + val log = logManager.getOrCreateLog(new TopicPartition(name, 0), LogConfig.fromProps(logConfig.originals, logProps)) var offset = 0L for (_ <- 0 until 200) { val set = TestUtils.singletonRecords("test".getBytes(), key="test".getBytes()) @@ -204,7 +207,7 @@ class LogManagerTest { logManager = createLogManager() logManager.startup() - val log = logManager.createLog(new TopicPartition(name, 0), config) + val log = logManager.getOrCreateLog(new TopicPartition(name, 0), config) val lastFlush = log.lastFlushTime for (_ <- 0 until 200) { val set = TestUtils.singletonRecords("test".getBytes()) @@ -228,7 +231,7 @@ class LogManagerTest { // verify that logs are always assigned to the least loaded partition for(partition <- 0 until 20) { - logManager.createLog(new TopicPartition("test", partition), logConfig) + logManager.getOrCreateLog(new TopicPartition("test", partition), logConfig) assertEquals("We should have created the right number of logs", partition + 1, logManager.allLogs.size) val counts = logManager.allLogs.groupBy(_.dir.getParent).values.map(_.size) assertTrue("Load should balance evenly", counts.max <= counts.min + 1) @@ -264,7 +267,7 @@ class LogManagerTest { 
logManager.shutdown() logDir = TestUtils.tempDir() logManager = TestUtils.createLogManager( - logDirs = Array(new File(logDir.getAbsolutePath + File.separator))) + logDirs = Array(new File(logDir.getAbsolutePath + File.separator)), zkUtils = zkUtils) logManager.startup() verifyCheckpointRecovery(Seq(new TopicPartition("test-a", 1)), logManager) } @@ -286,7 +289,7 @@ class LogManagerTest { private def verifyCheckpointRecovery(topicPartitions: Seq[TopicPartition], logManager: LogManager) { - val logs = topicPartitions.map(this.logManager.createLog(_, logConfig)) + val logs = topicPartitions.map(this.logManager.getOrCreateLog(_, logConfig)) logs.foreach(log => { for (_ <- 0 until 50) log.appendAsLeader(TestUtils.singletonRecords("test".getBytes()), leaderEpoch = 0) @@ -294,7 +297,7 @@ class LogManagerTest { log.flush() }) - logManager.checkpointRecoveryPointOffsets() + logManager.checkpointLogRecoveryOffsets() val checkpoints = new OffsetCheckpointFile(new File(logDir, logManager.RecoveryPointCheckpointFile)).read() topicPartitions.zip(logs).foreach { @@ -306,6 +309,7 @@ class LogManagerTest { private def createLogManager(logDirs: Array[File] = Array(this.logDir)): LogManager = { TestUtils.createLogManager( + zkUtils = zkUtils, defaultConfig = logConfig, logDirs = logDirs, time = this.time) diff --git a/core/src/test/scala/unit/kafka/server/AbstractFetcherThreadTest.scala b/core/src/test/scala/unit/kafka/server/AbstractFetcherThreadTest.scala index 00cda21fb971d..5b0c8d2490af9 100644 --- a/core/src/test/scala/unit/kafka/server/AbstractFetcherThreadTest.scala +++ b/core/src/test/scala/unit/kafka/server/AbstractFetcherThreadTest.scala @@ -116,7 +116,7 @@ class AbstractFetcherThreadTest { override def handleOffsetOutOfRange(topicPartition: TopicPartition): Long = 0L - override def handlePartitionsWithErrors(partitions: Iterable[TopicPartition]): Unit = {} + override def handlePartitionsWithErrors(partitions: Map[TopicPartition, Option[Exception]]): Unit = {} override 
protected def fetch(fetchRequest: DummyFetchRequest): Seq[(TopicPartition, TestPartitionData)] = fetchRequest.offsets.mapValues(_ => new TestPartitionData()).toSeq @@ -211,7 +211,9 @@ class AbstractFetcherThreadTest { new DummyFetchRequest(requestMap) } - override def handlePartitionsWithErrors(partitions: Iterable[TopicPartition]) = delayPartitions(partitions, fetchBackOffMs.toLong) + override def handlePartitionsWithErrors(partitions: Map[TopicPartition, Option[Exception]]) { + delayPartitions(partitions.keys, fetchBackOffMs.toLong) + } } diff --git a/core/src/test/scala/unit/kafka/server/HighwatermarkPersistenceTest.scala b/core/src/test/scala/unit/kafka/server/HighwatermarkPersistenceTest.scala index b6b40c21f7963..8356d0d817ecf 100755 --- a/core/src/test/scala/unit/kafka/server/HighwatermarkPersistenceTest.scala +++ b/core/src/test/scala/unit/kafka/server/HighwatermarkPersistenceTest.scala @@ -24,7 +24,6 @@ import org.apache.kafka.common.utils.Utils import org.easymock.EasyMock import org.junit._ import org.junit.Assert._ -import kafka.common._ import kafka.cluster.Replica import kafka.utils.{KafkaScheduler, MockTime, TestUtils, ZkUtils} import java.util.concurrent.atomic.AtomicBoolean @@ -35,24 +34,29 @@ class HighwatermarkPersistenceTest { val configs = TestUtils.createBrokerConfigs(2, TestUtils.MockZkConnect).map(KafkaConfig.fromProps) val topic = "foo" + val zkUtils = EasyMock.createMock(classOf[ZkUtils]) val logManagers = configs map { config => TestUtils.createLogManager( logDirs = config.logDirs.map(new File(_)).toArray, + zkUtils = zkUtils, cleanerConfig = CleanerConfig()) } - + + val logDirFailureChannels = configs map { config => + new LogDirFailureChannel(config.logDirs.size) + } + @After def teardown() { - for(manager <- logManagers; dir <- manager.logDirs) + for(manager <- logManagers; dir <- manager.liveLogDirs) Utils.delete(dir) } @Test def testHighWatermarkPersistenceSinglePartition() { // mock zkclient - val zkUtils = 
EasyMock.createMock(classOf[ZkUtils]) EasyMock.replay(zkUtils) - + // create kafka scheduler val scheduler = new KafkaScheduler(2) scheduler.startup @@ -61,7 +65,7 @@ class HighwatermarkPersistenceTest { // create replica manager val replicaManager = new ReplicaManager(configs.head, metrics, time, zkUtils, scheduler, logManagers.head, new AtomicBoolean(false), QuotaFactory.instantiate(configs.head, metrics, time).follower, - new BrokerTopicStats, new MetadataCache(configs.head.brokerId)) + new BrokerTopicStats, new MetadataCache(configs.head.brokerId), logDirFailureChannels.head) replicaManager.startup() try { replicaManager.checkpointHighWatermarks() @@ -69,7 +73,7 @@ class HighwatermarkPersistenceTest { assertEquals(0L, fooPartition0Hw) val partition0 = replicaManager.getOrCreatePartition(new TopicPartition(topic, 0)) // create leader and follower replicas - val log0 = logManagers.head.createLog(new TopicPartition(topic, 0), LogConfig()) + val log0 = logManagers.head.getOrCreateLog(new TopicPartition(topic, 0), LogConfig()) val leaderReplicaPartition0 = new Replica(configs.head.brokerId, partition0, time, 0, Some(log0)) partition0.addReplicaIfNotExists(leaderReplicaPartition0) val followerReplicaPartition0 = new Replica(configs.last.brokerId, partition0, time) @@ -96,7 +100,6 @@ class HighwatermarkPersistenceTest { val topic1 = "foo1" val topic2 = "foo2" // mock zkclient - val zkUtils = EasyMock.createMock(classOf[ZkUtils]) EasyMock.replay(zkUtils) // create kafka scheduler val scheduler = new KafkaScheduler(2) @@ -106,7 +109,7 @@ class HighwatermarkPersistenceTest { // create replica manager val replicaManager = new ReplicaManager(configs.head, metrics, time, zkUtils, scheduler, logManagers.head, new AtomicBoolean(false), QuotaFactory.instantiate(configs.head, metrics, time).follower, - new BrokerTopicStats, new MetadataCache(configs.head.brokerId)) + new BrokerTopicStats, new MetadataCache(configs.head.brokerId), logDirFailureChannels.head) 
replicaManager.startup() try { replicaManager.checkpointHighWatermarks() @@ -114,7 +117,7 @@ class HighwatermarkPersistenceTest { assertEquals(0L, topic1Partition0Hw) val topic1Partition0 = replicaManager.getOrCreatePartition(new TopicPartition(topic1, 0)) // create leader log - val topic1Log0 = logManagers.head.createLog(new TopicPartition(topic1, 0), LogConfig()) + val topic1Log0 = logManagers.head.getOrCreateLog(new TopicPartition(topic1, 0), LogConfig()) // create a local replica for topic1 val leaderReplicaTopic1Partition0 = new Replica(configs.head.brokerId, topic1Partition0, time, 0, Some(topic1Log0)) topic1Partition0.addReplicaIfNotExists(leaderReplicaTopic1Partition0) @@ -130,7 +133,7 @@ class HighwatermarkPersistenceTest { // add another partition and set highwatermark val topic2Partition0 = replicaManager.getOrCreatePartition(new TopicPartition(topic2, 0)) // create leader log - val topic2Log0 = logManagers.head.createLog(new TopicPartition(topic2, 0), LogConfig()) + val topic2Log0 = logManagers.head.getOrCreateLog(new TopicPartition(topic2, 0), LogConfig()) // create a local replica for topic2 val leaderReplicaTopic2Partition0 = new Replica(configs.head.brokerId, topic2Partition0, time, 0, Some(topic2Log0)) topic2Partition0.addReplicaIfNotExists(leaderReplicaTopic2Partition0) @@ -163,5 +166,5 @@ class HighwatermarkPersistenceTest { replicaManager.highWatermarkCheckpoints(new File(replicaManager.config.logDirs.head).getAbsolutePath).read.getOrElse( new TopicPartition(topic, partition), 0L) } - + } diff --git a/core/src/test/scala/unit/kafka/server/ISRExpirationTest.scala b/core/src/test/scala/unit/kafka/server/ISRExpirationTest.scala index 5d221fe599e2d..683b34e61eef4 100644 --- a/core/src/test/scala/unit/kafka/server/ISRExpirationTest.scala +++ b/core/src/test/scala/unit/kafka/server/ISRExpirationTest.scala @@ -16,13 +16,13 @@ */ package kafka.server +import java.io.File import java.util.Properties import java.util.concurrent.atomic.AtomicBoolean import 
kafka.cluster.{Partition, Replica} import kafka.log.Log -import kafka.server.checkpoints.{LeaderEpochCheckpointFile, LeaderEpochFile} -import kafka.server.epoch.{LeaderEpochCache, LeaderEpochFileCache} +import kafka.server.epoch.LeaderEpochCache import kafka.utils._ import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.metrics.Metrics @@ -54,9 +54,13 @@ class IsrExpirationTest { @Before def setUp() { - replicaManager = new ReplicaManager(configs.head, metrics, time, null, null, null, new AtomicBoolean(false), - QuotaFactory.instantiate(configs.head, metrics, time).follower, new BrokerTopicStats, - new MetadataCache(configs.head.brokerId)) + val logManager = EasyMock.createMock(classOf[kafka.log.LogManager]) + EasyMock.expect(logManager.liveLogDirs).andReturn(Array.empty[File]).anyTimes() + EasyMock.replay(logManager) + + replicaManager = new ReplicaManager(configs.head, metrics, time, null, null, logManager, new AtomicBoolean(false), + QuotaFactory.instantiate(configs.head, metrics, time).follower, new BrokerTopicStats, new MetadataCache(configs.head.brokerId), + new LogDirFailureChannel(configs.head.logDirs.size)) } @After diff --git a/core/src/test/scala/unit/kafka/server/LeaderElectionTest.scala b/core/src/test/scala/unit/kafka/server/LeaderElectionTest.scala index 3497cc3b5faf7..a5e82e2cff51a 100755 --- a/core/src/test/scala/unit/kafka/server/LeaderElectionTest.scala +++ b/core/src/test/scala/unit/kafka/server/LeaderElectionTest.scala @@ -145,10 +145,11 @@ class LeaderElectionTest extends ZooKeeperTestHarness { val partitionStates = Map( new TopicPartition(topic, partitionId) -> new PartitionState(2, brokerId2, LeaderAndIsr.initialLeaderEpoch, Seq(brokerId1, brokerId2).map(Integer.valueOf).asJava, LeaderAndIsr.initialZKVersion, - Seq(0, 1).map(Integer.valueOf).asJava) + Seq(0, 1).map(Integer.valueOf).asJava, false) ) + val version = ApiKeys.LEADER_AND_ISR.latestVersion val requestBuilder = new LeaderAndIsrRequest.Builder( - controllerId, 
staleControllerEpoch, partitionStates.asJava, nodes.toSet.asJava) + version, controllerId, staleControllerEpoch, partitionStates.asJava, nodes.toSet.asJava) controllerChannelManager.sendRequest(brokerId2, ApiKeys.LEADER_AND_ISR, requestBuilder, staleControllerEpochCallback) diff --git a/core/src/test/scala/unit/kafka/server/LogOffsetTest.scala b/core/src/test/scala/unit/kafka/server/LogOffsetTest.scala index 9383355b103b8..e053968e2c35b 100755 --- a/core/src/test/scala/unit/kafka/server/LogOffsetTest.scala +++ b/core/src/test/scala/unit/kafka/server/LogOffsetTest.scala @@ -181,7 +181,7 @@ class LogOffsetTest extends ZooKeeperTestHarness { AdminUtils.createTopic(zkUtils, topic, 3, 1) val logManager = server.getLogManager - val log = logManager.createLog(new TopicPartition(topic, part), logManager.defaultConfig) + val log = logManager.getOrCreateLog(new TopicPartition(topic, part), logManager.defaultConfig) for (_ <- 0 until 20) log.appendAsLeader(TestUtils.singletonRecords(value = Integer.toString(42).getBytes()), leaderEpoch = 0) @@ -210,7 +210,7 @@ class LogOffsetTest extends ZooKeeperTestHarness { AdminUtils.createTopic(zkUtils, topic, 3, 1) val logManager = server.getLogManager - val log = logManager.createLog(new TopicPartition(topic, part), logManager.defaultConfig) + val log = logManager.getOrCreateLog(new TopicPartition(topic, part), logManager.defaultConfig) for (_ <- 0 until 20) log.appendAsLeader(TestUtils.singletonRecords(value = Integer.toString(42).getBytes()), leaderEpoch = 0) log.flush() diff --git a/core/src/test/scala/unit/kafka/server/MetadataCacheTest.scala b/core/src/test/scala/unit/kafka/server/MetadataCacheTest.scala index d9fe99572a369..5535d1daae533 100644 --- a/core/src/test/scala/unit/kafka/server/MetadataCacheTest.scala +++ b/core/src/test/scala/unit/kafka/server/MetadataCacheTest.scala @@ -24,7 +24,7 @@ import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.network.ListenerName import 
org.apache.kafka.common.protocol.{ApiKeys, Errors, SecurityProtocol} import org.apache.kafka.common.requests.{PartitionState, UpdateMetadataRequest} -import org.apache.kafka.common.requests.UpdateMetadataRequest.{Broker, EndPoint} +import org.apache.kafka.common.requests.UpdateMetadataRequest.{Broker, EndPoint, PartitionState} import org.junit.Test import org.junit.Assert._ @@ -65,9 +65,9 @@ class MetadataCacheTest { }.toSet val partitionStates = Map( - new TopicPartition(topic0, 0) -> new PartitionState(controllerEpoch, 0, 0, asList(0, 1, 3), zkVersion, asList(0, 1, 3)), - new TopicPartition(topic0, 1) -> new PartitionState(controllerEpoch, 1, 1, asList(1, 0), zkVersion, asList(1, 2, 0, 4)), - new TopicPartition(topic1, 0) -> new PartitionState(controllerEpoch, 2, 2, asList(2, 1), zkVersion, asList(2, 1, 3))) + new TopicPartition(topic0, 0) -> new UpdateMetadataRequest.PartitionState(controllerEpoch, 0, 0, asList(0, 1, 3), zkVersion, asList(0, 1, 3), asList()), + new TopicPartition(topic0, 1) -> new UpdateMetadataRequest.PartitionState(controllerEpoch, 1, 1, asList(1, 0), zkVersion, asList(1, 2, 0, 4), asList()), + new TopicPartition(topic1, 0) -> new UpdateMetadataRequest.PartitionState(controllerEpoch, 2, 2, asList(2, 1), zkVersion, asList(2, 1, 3), asList())) val version = ApiKeys.UPDATE_METADATA_KEY.latestVersion val updateMetadataRequest = new UpdateMetadataRequest.Builder(version, controllerId, controllerEpoch, @@ -125,7 +125,7 @@ class MetadataCacheTest { val leader = 1 val leaderEpoch = 1 val partitionStates = Map( - new TopicPartition(topic, 0) -> new PartitionState(controllerEpoch, leader, leaderEpoch, asList(0), zkVersion, asList(0))) + new TopicPartition(topic, 0) -> new UpdateMetadataRequest.PartitionState(controllerEpoch, leader, leaderEpoch, asList(0), zkVersion, asList(0), asList())) val version = ApiKeys.UPDATE_METADATA_KEY.latestVersion val updateMetadataRequest = new UpdateMetadataRequest.Builder(version, controllerId, controllerEpoch, @@ -169,7 
+169,7 @@ class MetadataCacheTest { val isr = asList[Integer](0) val partitionStates = Map( - new TopicPartition(topic, 0) -> new PartitionState(controllerEpoch, leader, leaderEpoch, isr, zkVersion, replicas)) + new TopicPartition(topic, 0) -> new UpdateMetadataRequest.PartitionState(controllerEpoch, leader, leaderEpoch, isr, zkVersion, replicas, asList())) val version = ApiKeys.UPDATE_METADATA_KEY.latestVersion val updateMetadataRequest = new UpdateMetadataRequest.Builder(version, controllerId, controllerEpoch, @@ -229,7 +229,7 @@ class MetadataCacheTest { val isr = asList[Integer](0, 1) val partitionStates = Map( - new TopicPartition(topic, 0) -> new PartitionState(controllerEpoch, leader, leaderEpoch, isr, zkVersion, replicas)) + new TopicPartition(topic, 0) -> new UpdateMetadataRequest.PartitionState(controllerEpoch, leader, leaderEpoch, isr, zkVersion, replicas, asList())) val version = ApiKeys.UPDATE_METADATA_KEY.latestVersion val updateMetadataRequest = new UpdateMetadataRequest.Builder(version, controllerId, controllerEpoch, @@ -282,7 +282,7 @@ class MetadataCacheTest { val replicas = asList[Integer](0) val isr = asList[Integer](0, 1) val partitionStates = Map( - new TopicPartition(topic, 0) -> new PartitionState(controllerEpoch, leader, leaderEpoch, isr, 3, replicas)) + new TopicPartition(topic, 0) -> new UpdateMetadataRequest.PartitionState(controllerEpoch, leader, leaderEpoch, isr, 3, replicas, asList())) val version = ApiKeys.UPDATE_METADATA_KEY.latestVersion val updateMetadataRequest = new UpdateMetadataRequest.Builder(version, 2, controllerEpoch, partitionStates.asJava, brokers.asJava).build() @@ -315,7 +315,7 @@ class MetadataCacheTest { val replicas = asList[Integer](0) val isr = asList[Integer](0, 1) val partitionStates = Map( - new TopicPartition(topic, 0) -> new PartitionState(controllerEpoch, leader, leaderEpoch, isr, 3, replicas)) + new TopicPartition(topic, 0) -> new UpdateMetadataRequest.PartitionState(controllerEpoch, leader, leaderEpoch, 
isr, 3, replicas, asList())) val version = ApiKeys.UPDATE_METADATA_KEY.latestVersion val updateMetadataRequest = new UpdateMetadataRequest.Builder(version, 2, controllerEpoch, partitionStates.asJava, brokers.asJava).build() diff --git a/core/src/test/scala/unit/kafka/server/ReplicaManagerQuotasTest.scala b/core/src/test/scala/unit/kafka/server/ReplicaManagerQuotasTest.scala index 2ee08a225aab7..483e7081df022 100644 --- a/core/src/test/scala/unit/kafka/server/ReplicaManagerQuotasTest.scala +++ b/core/src/test/scala/unit/kafka/server/ReplicaManagerQuotasTest.scala @@ -16,6 +16,7 @@ */ package kafka.server +import java.io.File import java.util.Properties import java.util.concurrent.atomic.AtomicBoolean @@ -178,11 +179,12 @@ class ReplicaManagerQuotasTest { //Return the same log for each partition as it doesn't matter expect(logManager.getLog(anyObject())).andReturn(Some(log)).anyTimes() + expect(logManager.liveLogDirs).andReturn(Array.empty[File]).anyTimes() replay(logManager) replicaManager = new ReplicaManager(configs.head, metrics, time, zkUtils, scheduler, logManager, new AtomicBoolean(false), QuotaFactory.instantiate(configs.head, metrics, time).follower, - new BrokerTopicStats, new MetadataCache(configs.head.brokerId)) + new BrokerTopicStats, new MetadataCache(configs.head.brokerId), new LogDirFailureChannel(configs.head.logDirs.size)) //create the two replicas for ((p, _) <- fetchInfo) { diff --git a/core/src/test/scala/unit/kafka/server/ReplicaManagerTest.scala b/core/src/test/scala/unit/kafka/server/ReplicaManagerTest.scala index 57948544e97e2..e64e8dea1f2b9 100644 --- a/core/src/test/scala/unit/kafka/server/ReplicaManagerTest.scala +++ b/core/src/test/scala/unit/kafka/server/ReplicaManagerTest.scala @@ -5,7 +5,7 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -25,9 +25,9 @@ import kafka.log.LogConfig import kafka.utils.{MockScheduler, MockTime, TestUtils, ZkUtils} import TestUtils.createBroker import kafka.utils.timer.MockTimer -import org.I0Itec.zkclient.ZkClient +import kafka.zk.ZooKeeperTestHarness import org.apache.kafka.common.metrics.Metrics -import org.apache.kafka.common.protocol.Errors +import org.apache.kafka.common.protocol.{ApiKeys, Errors} import org.apache.kafka.common.record._ import org.apache.kafka.common.requests.{IsolationLevel, LeaderAndIsrRequest, PartitionState} import org.apache.kafka.common.requests.ProduceResponse.PartitionResponse @@ -41,22 +41,20 @@ import org.junit.{After, Before, Test} import scala.collection.JavaConverters._ import scala.collection.Map -class ReplicaManagerTest { +class ReplicaManagerTest extends ZooKeeperTestHarness { val topic = "test-topic" val time = new MockTime val metrics = new Metrics - var zkClient : ZkClient = _ - var zkUtils : ZkUtils = _ - + @Before - def setUp() { - zkClient = EasyMock.createMock(classOf[ZkClient]) - zkUtils = ZkUtils(zkClient, isZkSecurityEnabled = false) + override def setUp() { + super.setUp() } - + @After - def tearDown() { + override def tearDown() { + super.tearDown() metrics.close() } @@ -64,10 +62,10 @@ class ReplicaManagerTest { def testHighWaterMarkDirectoryMapping() { val props = TestUtils.createBrokerConfig(1, TestUtils.MockZkConnect) val config = KafkaConfig.fromProps(props) - val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray) + val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, zkUtils) val rm = new ReplicaManager(config, metrics, time, zkUtils, new MockScheduler(time), mockLogMgr, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - new 
MetadataCache(config.brokerId)) + new MetadataCache(config.brokerId), new LogDirFailureChannel(config.logDirs.size)) try { val partition = rm.getOrCreatePartition(new TopicPartition(topic, 1)) partition.getOrCreateReplica(1) @@ -83,10 +81,10 @@ class ReplicaManagerTest { val props = TestUtils.createBrokerConfig(1, TestUtils.MockZkConnect) props.put("log.dir", TestUtils.tempRelativeDir("data").getAbsolutePath) val config = KafkaConfig.fromProps(props) - val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray) + val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, zkUtils) val rm = new ReplicaManager(config, metrics, time, zkUtils, new MockScheduler(time), mockLogMgr, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - new MetadataCache(config.brokerId)) + new MetadataCache(config.brokerId), new LogDirFailureChannel(config.logDirs.size)) try { val partition = rm.getOrCreatePartition(new TopicPartition(topic, 1)) partition.getOrCreateReplica(1) @@ -101,10 +99,10 @@ class ReplicaManagerTest { def testIllegalRequiredAcks() { val props = TestUtils.createBrokerConfig(1, TestUtils.MockZkConnect) val config = KafkaConfig.fromProps(props) - val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray) + val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, zkUtils) val rm = new ReplicaManager(config, metrics, time, zkUtils, new MockScheduler(time), mockLogMgr, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - new MetadataCache(config.brokerId), Option(this.getClass.getName)) + new MetadataCache(config.brokerId), new LogDirFailureChannel(config.logDirs.size), Option(this.getClass.getName)) try { def callback(responseStatus: Map[TopicPartition, PartitionResponse]) = { assert(responseStatus.values.head.error == Errors.INVALID_REQUIRED_ACKS) @@ -130,14 
+128,15 @@ class ReplicaManagerTest { props.put("log.dir", TestUtils.tempRelativeDir("data").getAbsolutePath) val config = KafkaConfig.fromProps(props) val logProps = new Properties() - val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, LogConfig(logProps)) + logProps.put(LogConfig.MessageTimestampDifferenceMaxMsProp, Long.MaxValue.toString) + val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, zkUtils, LogConfig(logProps)) val aliveBrokers = Seq(createBroker(0, "host0", 0), createBroker(1, "host1", 1)) val metadataCache = EasyMock.createMock(classOf[MetadataCache]) EasyMock.expect(metadataCache.getAliveBrokers).andReturn(aliveBrokers).anyTimes() EasyMock.replay(metadataCache) val rm = new ReplicaManager(config, metrics, time, zkUtils, new MockScheduler(time), mockLogMgr, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - metadataCache) + metadataCache, new LogDirFailureChannel(config.logDirs.size)) try { val brokerList = Seq[Integer](0, 1).asJava @@ -145,8 +144,8 @@ class ReplicaManagerTest { val partition = rm.getOrCreatePartition(new TopicPartition(topic, 0)) partition.getOrCreateReplica(0) // Make this replica the leader. 
- val leaderAndIsrRequest1 = new LeaderAndIsrRequest.Builder(0, 0, - collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 0, 0, brokerList, 0, brokerList)).asJava, + val leaderAndIsrRequest1 = new LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, 0, 0, + collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 0, 0, brokerList, 0, brokerList, false)).asJava, Set(new Node(0, "host1", 0), new Node(1, "host2", 1)).asJava).build() rm.becomeLeaderOrFollower(0, leaderAndIsrRequest1, (_, _) => ()) rm.getLeaderReplicaIfLocal(new TopicPartition(topic, 0)) @@ -162,8 +161,8 @@ class ReplicaManagerTest { assertFalse(fetchResult.isFired) // Make this replica the follower - val leaderAndIsrRequest2 = new LeaderAndIsrRequest.Builder(0, 0, - collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 1, 1, brokerList, 0, brokerList)).asJava, + val leaderAndIsrRequest2 = new LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, 0, 0, + collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 1, 1, brokerList, 0, brokerList, false)).asJava, Set(new Node(0, "host1", 0), new Node(1, "host2", 1)).asJava).build() rm.becomeLeaderOrFollower(1, leaderAndIsrRequest2, (_, _) => ()) @@ -186,8 +185,8 @@ class ReplicaManagerTest { partition.getOrCreateReplica(0) // Make this replica the leader. 
- val leaderAndIsrRequest1 = new LeaderAndIsrRequest.Builder(0, 0, - collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 0, 0, brokerList, 0, brokerList)).asJava, + val leaderAndIsrRequest1 = new LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, 0, 0, + collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 0, 0, brokerList, 0, brokerList, true)).asJava, Set(new Node(0, "host1", 0), new Node(1, "host2", 1)).asJava).build() replicaManager.becomeLeaderOrFollower(0, leaderAndIsrRequest1, (_, _) => ()) replicaManager.getLeaderReplicaIfLocal(new TopicPartition(topic, 0)) @@ -271,14 +270,12 @@ class ReplicaManagerTest { try { val brokerList: java.util.List[Integer] = Seq[Integer](0, 1).asJava - val brokerSet: java.util.Set[Integer] = Set[Integer](0, 1).asJava - val partition = replicaManager.getOrCreatePartition(new TopicPartition(topic, 0)) partition.getOrCreateReplica(0) // Make this replica the leader. - val leaderAndIsrRequest1 = new LeaderAndIsrRequest.Builder(0, 0, - collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 0, 0, brokerList, 0, brokerList)).asJava, + val leaderAndIsrRequest1 = new LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, 0, 0, + collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 0, 0, brokerList, 0, brokerList, true)).asJava, Set(new Node(0, "host1", 0), new Node(1, "host2", 1)).asJava).build() replicaManager.becomeLeaderOrFollower(0, leaderAndIsrRequest1, (_, _) => ()) replicaManager.getLeaderReplicaIfLocal(new TopicPartition(topic, 0)) @@ -336,7 +333,8 @@ class ReplicaManagerTest { props.put("broker.id", Int.box(0)) val config = KafkaConfig.fromProps(props) val logProps = new Properties() - val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, LogConfig(logProps)) + logProps.put(LogConfig.MessageTimestampDifferenceMaxMsProp, Long.MaxValue.toString) + val mockLogMgr = 
TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, zkUtils, LogConfig(logProps)) val aliveBrokers = Seq(createBroker(0, "host0", 0), createBroker(1, "host1", 1), createBroker(1, "host2", 2)) val metadataCache = EasyMock.createMock(classOf[MetadataCache]) EasyMock.expect(metadataCache.getAliveBrokers).andReturn(aliveBrokers).anyTimes() @@ -346,7 +344,7 @@ class ReplicaManagerTest { EasyMock.replay(metadataCache) val rm = new ReplicaManager(config, metrics, time, zkUtils, new MockScheduler(time), mockLogMgr, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - metadataCache, Option(this.getClass.getName)) + metadataCache, new LogDirFailureChannel(config.logDirs.size), Option(this.getClass.getName)) try { val brokerList = Seq[Integer](0, 1, 2).asJava @@ -355,8 +353,8 @@ class ReplicaManagerTest { partition.getOrCreateReplica(0) // Make this replica the leader. - val leaderAndIsrRequest1 = new LeaderAndIsrRequest.Builder(0, 0, - collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 0, 0, brokerList, 0, brokerList)).asJava, + val leaderAndIsrRequest1 = new LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, 0, 0, + collection.immutable.Map(new TopicPartition(topic, 0) -> new PartitionState(0, 0, 0, brokerList, 0, brokerList, false)).asJava, Set(new Node(0, "host1", 0), new Node(1, "host2", 1), new Node(2, "host2", 2)).asJava).build() rm.becomeLeaderOrFollower(0, leaderAndIsrRequest1, (_, _) => ()) rm.getLeaderReplicaIfLocal(new TopicPartition(topic, 0)) @@ -481,7 +479,7 @@ class ReplicaManagerTest { props.put("broker.id", Int.box(0)) val config = KafkaConfig.fromProps(props) val logProps = new Properties() - val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, LogConfig(logProps)) + val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)).toArray, zkUtils, LogConfig(logProps)) val aliveBrokers = 
Seq(createBroker(0, "host0", 0), createBroker(1, "host1", 1)) val metadataCache = EasyMock.createMock(classOf[MetadataCache]) EasyMock.expect(metadataCache.getAliveBrokers).andReturn(aliveBrokers).anyTimes() @@ -498,7 +496,8 @@ class ReplicaManagerTest { new ReplicaManager(config, metrics, time, zkUtils, new MockScheduler(time), mockLogMgr, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - metadataCache, mockProducePurgatory, mockFetchPurgatory, mockDeleteRecordsPurgatory, Option(this.getClass.getName)) + metadataCache, new LogDirFailureChannel(config.logDirs.size), mockProducePurgatory, mockFetchPurgatory, + mockDeleteRecordsPurgatory, Option(this.getClass.getName)) } } diff --git a/core/src/test/scala/unit/kafka/server/RequestQuotaTest.scala b/core/src/test/scala/unit/kafka/server/RequestQuotaTest.scala index 7c171b0dfe5fa..ecb75ce587b16 100644 --- a/core/src/test/scala/unit/kafka/server/RequestQuotaTest.scala +++ b/core/src/test/scala/unit/kafka/server/RequestQuotaTest.scala @@ -180,15 +180,16 @@ class RequestQuotaTest extends BaseRequestTest { .setTargetTimes(Map(tp -> (0L: java.lang.Long)).asJava) case ApiKeys.LEADER_AND_ISR => - new LeaderAndIsrRequest.Builder(brokerId, Int.MaxValue, - Map(tp -> new PartitionState(Int.MaxValue, brokerId, Int.MaxValue, List(brokerId).asJava, 2, Seq(brokerId).asJava)).asJava, + new LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, brokerId, Int.MaxValue, + Map(tp -> new PartitionState(Int.MaxValue, brokerId, Int.MaxValue, List(brokerId).asJava, 2, Seq(brokerId).asJava, true)).asJava, Set(new Node(brokerId, "localhost", 0)).asJava) case ApiKeys.STOP_REPLICA => new StopReplicaRequest.Builder(brokerId, Int.MaxValue, true, Set(tp).asJava) case ApiKeys.UPDATE_METADATA_KEY => - val partitionState = Map(tp -> new PartitionState(Int.MaxValue, brokerId, Int.MaxValue, List(brokerId).asJava, 2, Seq(brokerId).asJava)).asJava + val partitionState = Map(tp -> new 
UpdateMetadataRequest.PartitionState( + Int.MaxValue, brokerId, Int.MaxValue, List(brokerId).asJava, 2, Seq(brokerId).asJava, Seq.empty[Integer].asJava)).asJava val securityProtocol = SecurityProtocol.PLAINTEXT val brokers = Set(new UpdateMetadataRequest.Broker(brokerId, Seq(new UpdateMetadataRequest.EndPoint("localhost", 0, securityProtocol, diff --git a/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala b/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala index 72d7fc5f40efa..a20c65979e07b 100644 --- a/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala +++ b/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala @@ -16,6 +16,8 @@ */ package kafka.server +import java.io.File + import kafka.api._ import kafka.utils._ import kafka.cluster.Replica @@ -105,12 +107,13 @@ class SimpleFetchTest { // create the log manager that is aware of this mock log val logManager = EasyMock.createMock(classOf[kafka.log.LogManager]) EasyMock.expect(logManager.getLog(topicPartition)).andReturn(Some(log)).anyTimes() + EasyMock.expect(logManager.liveLogDirs).andReturn(Array.empty[File]).anyTimes() EasyMock.replay(logManager) // create the replica manager replicaManager = new ReplicaManager(configs.head, metrics, time, zkUtils, scheduler, logManager, new AtomicBoolean(false), QuotaFactory.instantiate(configs.head, metrics, time).follower, new BrokerTopicStats, - new MetadataCache(configs.head.brokerId)) + new MetadataCache(configs.head.brokerId), new LogDirFailureChannel(configs.head.logDirs.size)) // add the partition with two replicas, both in ISR val partition = replicaManager.getOrCreatePartition(new TopicPartition(topic, partitionId)) diff --git a/core/src/test/scala/unit/kafka/server/epoch/OffsetsForLeaderEpochTest.scala b/core/src/test/scala/unit/kafka/server/epoch/OffsetsForLeaderEpochTest.scala index d004641a1b93a..e8c08fef501c7 100644 --- a/core/src/test/scala/unit/kafka/server/epoch/OffsetsForLeaderEpochTest.scala +++ 
b/core/src/test/scala/unit/kafka/server/epoch/OffsetsForLeaderEpochTest.scala @@ -16,6 +16,7 @@ */ package kafka.server.epoch +import java.io.File import java.util.concurrent.atomic.AtomicBoolean import kafka.cluster.Replica @@ -30,6 +31,7 @@ import org.easymock.EasyMock._ import org.junit.Assert._ import org.junit.Test + class OffsetsForLeaderEpochTest { private val config = TestUtils.createBrokerConfigs(1, TestUtils.MockZkConnect).map(KafkaConfig.fromProps).head private val time = new MockTime @@ -46,14 +48,16 @@ class OffsetsForLeaderEpochTest { //Stubs val mockLog = createNiceMock(classOf[kafka.log.Log]) val mockCache = createNiceMock(classOf[kafka.server.epoch.LeaderEpochCache]) + val logManager = createNiceMock(classOf[kafka.log.LogManager]) expect(mockCache.endOffsetFor(epochRequested)).andReturn(offset) expect(mockLog.leaderEpochCache).andReturn(mockCache).anyTimes() - replay(mockCache, mockLog) + expect(logManager.liveLogDirs).andReturn(Array.empty[File]).anyTimes() + replay(mockCache, mockLog, logManager) // create a replica manager with 1 partition that has 1 replica - val replicaManager = new ReplicaManager(config, metrics, time, null, null, null, new AtomicBoolean(false), + val replicaManager = new ReplicaManager(config, metrics, time, null, null, logManager, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - new MetadataCache(config.brokerId)) + new MetadataCache(config.brokerId), new LogDirFailureChannel(config.logDirs.size)) val partition = replicaManager.getOrCreatePartition(tp) val leaderReplica = new Replica(config.brokerId, partition, time, 0, Some(mockLog)) partition.addReplicaIfNotExists(leaderReplica) @@ -68,10 +72,14 @@ class OffsetsForLeaderEpochTest { @Test def shouldReturnNoLeaderForPartitionIfThrown(): Unit = { + val logManager = createNiceMock(classOf[kafka.log.LogManager]) + expect(logManager.liveLogDirs).andReturn(Array.empty[File]).anyTimes() + replay(logManager) + //create a 
replica manager with 1 partition that has 0 replica - val replicaManager = new ReplicaManager(config, metrics, time, null, null, null, new AtomicBoolean(false), + val replicaManager = new ReplicaManager(config, metrics, time, null, null, logManager, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - new MetadataCache(config.brokerId)) + new MetadataCache(config.brokerId), new LogDirFailureChannel(config.logDirs.size)) replicaManager.getOrCreatePartition(tp) //Given @@ -87,10 +95,14 @@ class OffsetsForLeaderEpochTest { @Test def shouldReturnUnknownTopicOrPartitionIfThrown(): Unit = { + val logManager = createNiceMock(classOf[kafka.log.LogManager]) + expect(logManager.liveLogDirs).andReturn(Array.empty[File]).anyTimes() + replay(logManager) + //create a replica manager with 0 partition - val replicaManager = new ReplicaManager(config, metrics, time, null, null, null, new AtomicBoolean(false), + val replicaManager = new ReplicaManager(config, metrics, time, null, null, logManager, new AtomicBoolean(false), QuotaFactory.instantiate(config, metrics, time).follower, new BrokerTopicStats, - new MetadataCache(config.brokerId)) + new MetadataCache(config.brokerId), new LogDirFailureChannel(config.logDirs.size)) //Given val epochRequested: Integer = 5 diff --git a/core/src/test/scala/unit/kafka/utils/TestUtils.scala b/core/src/test/scala/unit/kafka/utils/TestUtils.scala index 4976f52aefee2..e1e48141b0478 100755 --- a/core/src/test/scala/unit/kafka/utils/TestUtils.scala +++ b/core/src/test/scala/unit/kafka/utils/TestUtils.scala @@ -156,11 +156,12 @@ object TestUtils extends Logging { enableSsl: Boolean = false, enableSaslPlaintext: Boolean = false, enableSaslSsl: Boolean = false, - rackInfo: Map[Int, String] = Map()): Seq[Properties] = { + rackInfo: Map[Int, String] = Map(), + logDirCount: Int = 1): Seq[Properties] = { (0 until numConfigs).map { node => createBrokerConfig(node, zkConnect, enableControlledShutdown, 
enableDeleteTopic, RandomPort, interBrokerSecurityProtocol, trustStoreFile, saslProperties, enablePlaintext = enablePlaintext, enableSsl = enableSsl, - enableSaslPlaintext = enableSaslPlaintext, enableSaslSsl = enableSaslSsl, rack = rackInfo.get(node)) + enableSaslPlaintext = enableSaslPlaintext, enableSaslSsl = enableSaslSsl, rack = rackInfo.get(node), logDirCount = logDirCount) } } @@ -205,7 +206,7 @@ object TestUtils extends Logging { enablePlaintext: Boolean = true, enableSaslPlaintext: Boolean = false, saslPlaintextPort: Int = RandomPort, enableSsl: Boolean = false, sslPort: Int = RandomPort, - enableSaslSsl: Boolean = false, saslSslPort: Int = RandomPort, rack: Option[String] = None) + enableSaslSsl: Boolean = false, saslSslPort: Int = RandomPort, rack: Option[String] = None, logDirCount: Int = 1) : Properties = { def shouldEnable(protocol: SecurityProtocol) = interBrokerSecurityProtocol.fold(false)(_ == protocol) @@ -227,7 +228,8 @@ object TestUtils extends Logging { val props = new Properties if (nodeId >= 0) props.put(KafkaConfig.BrokerIdProp, nodeId.toString) props.put(KafkaConfig.ListenersProp, listeners) - props.put(KafkaConfig.LogDirProp, TestUtils.tempDir().getAbsolutePath) + val logDir = (1 to logDirCount).toList.map(i => TestUtils.tempDir().getAbsolutePath).mkString(",") + props.put(KafkaConfig.LogDirProp, logDir) props.put(KafkaConfig.ZkConnectProp, zkConnect) props.put(KafkaConfig.ZkConnectionTimeoutMsProp, "10000") props.put(KafkaConfig.ReplicaSocketTimeoutMsProp, "1500") @@ -1006,10 +1008,13 @@ object TestUtils extends Logging { * Create new LogManager instance with default configuration for testing */ def createLogManager(logDirs: Array[File] = Array.empty[File], + zkUtils: ZkUtils, defaultConfig: LogConfig = LogConfig(), cleanerConfig: CleanerConfig = CleanerConfig(enableCleaner = false), - time: MockTime = new MockTime()): LogManager = { + time: MockTime = new MockTime(), + brokerId: Int = 0): LogManager = { new LogManager(logDirs = logDirs, 
+ initialOfflineDirs = Array.empty[File], topicConfigs = Map(), defaultConfig = defaultConfig, cleanerConfig = cleanerConfig, @@ -1021,8 +1026,9 @@ object TestUtils extends Logging { maxPidExpirationMs = 60 * 60 * 1000, scheduler = time.scheduler, time = time, - brokerState = BrokerState(), - brokerTopicStats = new BrokerTopicStats) + brokerState = new BrokerState(), + brokerTopicStats = new BrokerTopicStats, + logDirFailureChannel = new LogDirFailureChannel(logDirs.size)) } @deprecated("This method has been deprecated and it will be removed in a future release.", "0.10.0.0") @@ -1161,7 +1167,7 @@ object TestUtils extends Logging { servers.forall(server => topicPartitions.forall(tp => server.getLogManager().getLog(tp).isEmpty))) // ensure that topic is removed from all cleaner offsets TestUtils.waitUntilTrue(() => servers.forall(server => topicPartitions.forall { tp => - val checkpoints = server.getLogManager().logDirs.map { logDir => + val checkpoints = server.getLogManager().liveLogDirs.map { logDir => new OffsetCheckpointFile(new File(logDir, "cleaner-offset-checkpoint")).read() } checkpoints.forall(checkpointsPerLogDir => !checkpointsPerLogDir.contains(tp)) diff --git a/docs/upgrade.html b/docs/upgrade.html index e61f6c7de2d90..63477e528b894 100644 --- a/docs/upgrade.html +++ b/docs/upgrade.html @@ -17,7 +17,54 @@ -