KAFKA-4916: test streams with brokers failing #2719

Closed · wants to merge 42 commits (changes shown from 13 commits)

Commits (42)
187f9b6  Temporarily use only 2 relevant tests for branch builder (Mar 5, 2017)
87f06a8  Undo previous (Mar 5, 2017)
978f925  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (Mar 6, 2017)
57856b0  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (Mar 6, 2017)
5086dd7  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (Mar 7, 2017)
40394f3  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (Mar 9, 2017)
1ee01c1  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (Mar 11, 2017)
0d30ba4  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (enothereska, Mar 13, 2017)
7dae290  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (Mar 14, 2017)
7252e62  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (enothereska, Mar 17, 2017)
4eb3670  Merge branch 'trunk' of github.com:enothereska/kafka into trunk (enothereska, Mar 17, 2017)
9df1010  Initial test (Mar 21, 2017)
829f57a  Added more tests (Mar 22, 2017)
f702616  Tighten test options to minimize data loss or duplicates (Mar 22, 2017)
7f68745  Added tests with failing more than one broker (Mar 22, 2017)
10f40a3  Merge remote-tracking branch 'origin/trunk' into KAFKA-4916-broker-bo… (Mar 23, 2017)
fc1cabc  Checkpoint (Mar 25, 2017)
5e045e7  Merge branch 'trunk' of github.com:enothereska/kafka into trunk (enothereska, Mar 27, 2017)
b34d6d3  Checkpoint (Mar 27, 2017)
05324ce  Revert simple benchmark (Mar 27, 2017)
87b4ade  Cleanup (Mar 27, 2017)
6a42955  Cleanup (Mar 27, 2017)
cd274dc  Checkstyle (Mar 27, 2017)
247c7ea  Fix state lock issue (Mar 27, 2017)
6defb84  Catch and translate network exceptions (Mar 27, 2017)
213b892  Addressed StreamThread comments (Mar 28, 2017)
19d502f  Merge branch 'trunk' of github.com:enothereska/kafka into trunk (enothereska, Mar 28, 2017)
4ac627a  Removed part that requires KIP (Mar 28, 2017)
90d077a  Adjust timeout (Mar 28, 2017)
878d3b6  Remove line (Mar 28, 2017)
090877e  Fix test (Mar 29, 2017)
595566b  Merge remote-tracking branch 'origin/trunk' into KAFKA-4916-broker-bo… (Mar 29, 2017)
17029e7  Merge with trunk (Mar 30, 2017)
86edfe2  Addressed Damian's comments (Mar 30, 2017)
a67575e  Matthias' comment (Mar 30, 2017)
0b5743c  Merge remote-tracking branch 'origin/trunk' into KAFKA-4916-broker-bo… (Mar 31, 2017)
ad1848f  Addressed comments (Mar 31, 2017)
f66bcd3  Merge branch 'trunk' of github.com:enothereska/kafka into trunk (enothereska, Apr 3, 2017)
186d7d0  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (enothereska, Apr 3, 2017)
bc68fe8  Merge remote-tracking branch 'apache-kafka/trunk' into trunk (enothereska, Apr 3, 2017)
bca3a69  Merge with trunk (enothereska, Apr 3, 2017)
d690183  Remove unnecessary call (Apr 4, 2017)
@@ -240,8 +240,8 @@ public class StreamsConfig extends AbstractConfig {
STATE_DIR_DOC)
.define(REPLICATION_FACTOR_CONFIG,
Type.INT,
- 1,
- Importance.MEDIUM,
+ 3,
Member:

I would not change this default value -- it's a hassle for anyone who wants to run a demo with a local single-broker setup.

Contributor:

Maybe a more dynamic default here, like with acks (where all = -1)?

e.g. set replicationFactor=-1 => actualReplicationFactor=min(3, brokers.size())

Contributor Author:

The code actually uses min(#brokers, REPLICATION_FACTOR), and prints a warning, but it still runs with, say, 1 broker.

Member:

I think this dynamic change is quite dangerous -- if I specify replication 3 and cannot get it, I want an exception... Thus, I would leave the replication factor at 1 for demo purposes -- if anyone goes to production she can set it to whatever value is suitable -- or we make the parameter non-optional.

I think it would be a hassle to have a default value of 3 and override it with 1 in each example we do...

Contributor:

Yeah, my suggestion was to make the dynamism configurable.

REPLICATION_FACTOR_CONFIG = N where N > 0 => I know what I want; Streams should fail if it can't meet this contract.
REPLICATION_FACTOR_CONFIG = -1 => Streams is smart and can figure it out for me.

This allows me to be as strict as I want, but lets the defaults work for demos/tests/etc.
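
A minimal sketch of the contract proposed here, assuming a hypothetical -1 sentinel and resolver (neither exists in StreamsConfig):

    import org.apache.kafka.streams.errors.StreamsException;

    // Hypothetical helper, illustrative only -- not part of this PR.
    final class ReplicationFactorResolver {
        static int resolve(final int configured, final int brokerCount) {
            if (configured == -1) {
                // "streams is smart": pick a safe value, capped by the cluster size
                return Math.min(3, brokerCount);
            }
            if (configured > brokerCount) {
                // an explicit contract that cannot be met: fail fast, don't degrade
                throw new StreamsException("Replication factor " + configured
                    + " exceeds the number of available brokers (" + brokerCount + ").");
            }
            return configured;
        }
    }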

Member:

> REPLICATION_FACTOR_CONFIG = -1 => Streams is smart and can figure it out for me.

What should this be then? Would we need another parameter, "default_replication_factor", so that Streams can choose between 1 and that value? Not sure -- this might be too confusing.

Contributor:

My suggestion above was min(3, brokers.size()).

I don't like it because it seems like magic, but it also addresses most people's issues.

Contributor Author:

This will now require a KIP. Will do in a separate PR.

+ Importance.HIGH,
REPLICATION_FACTOR_DOC)
.define(TIMESTAMP_EXTRACTOR_CLASS_CONFIG,
Type.CLASS,
@@ -391,6 +391,8 @@ public class StreamsConfig extends AbstractConfig {
static {
final Map<String, Object> tempProducerDefaultOverrides = new HashMap<>();
tempProducerDefaultOverrides.put(ProducerConfig.LINGER_MS_CONFIG, "100");
+ tempProducerDefaultOverrides.put(ProducerConfig.RETRIES_CONFIG, Integer.MAX_VALUE);
+ tempProducerDefaultOverrides.put(ProducerConfig.ACKS_CONFIG, "all");
Member:

We need a KIP if we change any default values...

Contributor Author:

Do we, @guozhangwang? I thought we needed a KIP to add new config values, not each time we tune them.

Contributor:

Let's write a quick KIP for this (also covering the default replication factor of 3 above)? I think these are mostly bug fixes, but it would be better to make them well known in the community as well.

Contributor Author:

I think a KIP unnecessarily slows things down. Why do we need to do a KIP to correct an internal flaw? Users are already expecting internal topics to be robust. I'd argue we're fixing a bug here.

Member:

I am happy without a KIP :) Makes life easier for us. It's @guozhangwang's call. Or any other committer's.

Contributor Author:

cc @ijuma

Contributor:

We typically do KIPs for config changes that impact users. KIP-106 is one such example. If you can make the case that this is an internal bug fix and has no compatibility impact, then no KIP is needed. The replication factor one would seemingly have a compatibility impact.

Contributor Author:

This will now require a KIP and will be done in a separate PR.


PRODUCER_DEFAULT_OVERRIDES = Collections.unmodifiableMap(tempProducerDefaultOverrides);
}
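
For context, these entries are only defaults: when the producer configs are assembled, a value the user sets explicitly still wins. A hedged illustration (the property values are arbitrary):

    import java.util.Properties;
    import org.apache.kafka.clients.producer.ProducerConfig;
    import org.apache.kafka.streams.StreamsConfig;

    // Assuming the usual StreamsConfig override behavior: an explicit user
    // setting takes precedence over PRODUCER_DEFAULT_OVERRIDES.
    final Properties props = new Properties();
    props.put(StreamsConfig.APPLICATION_ID_CONFIG, "my-app");
    props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
    props.put(ProducerConfig.RETRIES_CONFIG, 10);  // overrides the Integer.MAX_VALUE default above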
@@ -55,16 +55,29 @@ public InternalTopicManager(final StreamsKafkaClient streamsKafkaClient, final i
* If a topic exists already but has different number of partitions we fail and throw exception requesting user to reset the app before restarting again.
*/
public void makeReady(final Map<InternalTopicConfig, Integer> topics) {
+ int actualReplicationFactor = replicationFactor;
for (int i = 0; i < MAX_TOPIC_READY_TRY; i++) {
try {
final MetadataResponse metadata = streamsKafkaClient.fetchMetadata();
final Map<String, Integer> existingTopicPartitions = fetchExistingPartitionCountByTopic(metadata);
final Map<InternalTopicConfig, Integer> topicsToBeCreated = validateTopicPartitions(topics, existingTopicPartitions);
Contributor:

I think validateTopicPartitions should also validate replicationFactor.
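
A hedged sketch of what that extra check could look like (the method is hypothetical; the MetadataResponse accessors are assumed from the surrounding code):

    // Hypothetical addition: verify the replication factor of existing internal
    // topics, mirroring how partition counts are validated today.
    private void validateReplicationFactor(final MetadataResponse metadata,
                                           final Set<String> internalTopics,
                                           final int expectedReplication) {
        for (final MetadataResponse.TopicMetadata topic : metadata.topicMetadata()) {
            if (!internalTopics.contains(topic.topic())) {
                continue;
            }
            for (final MetadataResponse.PartitionMetadata partition : topic.partitionMetadata()) {
                if (partition.replicas().size() != expectedReplication) {
                    throw new StreamsException("Internal topic " + topic.topic()
                        + " has replication factor " + partition.replicas().size()
                        + ", expected " + expectedReplication + ".");
                }
            }
        }
    }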

- streamsKafkaClient.createTopics(topicsToBeCreated, replicationFactor, windowChangeLogAdditionalRetention, metadata);
+ if (metadata.brokers().size() > 0 && metadata.brokers().size() < replicationFactor) {
Contributor:

We currently don't do anything about min.insync.replicas - should we?

Contributor Author:

Yes we should. @norwood any insights on what you have found useful for that? Thanks.

Contributor (@norwood, Mar 27, 2017):

Something like min(2, replicationFactor) should be a good default.

I'd also be concerned about this override when brokers.size() < replicationFactor. I think I'd prefer we fail here rather than get into a misconfigured state. We have run into issues where a user brings up a cluster and, while Kafka is doing its thing, also brings up their streams app. So during startup we see 1 broker, then some time down the line brokers 2...N. This caused us to pre-create a bunch of our streams topics incorrectly (when we only saw one broker), and then on restart we would try to verify topics against actual configs and fail.
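
A hedged sketch combining the two suggestions -- fail fast when the cluster is too small, and derive min.insync.replicas as min(2, replicationFactor); the topicConfig map is hypothetical:

    // Not the code in this PR: fail instead of shrinking the replication
    // factor, and pick a conservative min.insync.replicas default.
    final int brokerCount = metadata.brokers().size();
    if (brokerCount < replicationFactor) {
        throw new StreamsException("Only " + brokerCount + " brokers available, but replication.factor is "
            + replicationFactor + ". Decrease replication.factor or add brokers.");
    }
    // min(2, replicationFactor): tolerate one replica being down without
    // requiring more in-sync replicas than the topic has.
    topicConfig.put("min.insync.replicas", String.valueOf(Math.min(2, replicationFactor)));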

Contributor Author:

This will now be done in a KIP and a different PR.

log.warn("The number of available brokers {} is less than the desired replication " +
"factor for streams internal topics {}. If running in production, consider " +
"increasing the number of available brokers.",
metadata.brokers().size(), replicationFactor);
actualReplicationFactor = metadata.brokers().size();
Member:

Do we really want to do this? I would strongly prefer to throw an exception to the user!

Contributor Author:

This is consistent with how things like the schema registry, proactive support, etc. handle cases where the number of brokers is less than the replication factor.

Contributor Author:

The goal here is to do the right thing when there are enough brokers, not to provide magic when there just aren't enough brokers (e.g., in a test environment). Currently we do the wrong thing when there are enough brokers.

Contributor:

I'd suggest throwing an exception here, with motivation similar to @norwood's above. We have seen similar issues with the offsets topic's num.partitions, where we do this "min(broker.size, required num.brokers)" trick, and it introduces much more confusion than user-friendly benefit. For unit tests we should just always override these configs.

Contributor Author:

@guozhangwang @mjsax if I understand you right, you want the default to be 3, with the option for a user to set it to 1, right?
Or do you want no change at all to what we currently have (default 1, user can set it higher)? I don't like the current option since it leads to trouble in production. I'm OK with the first option.

Member:

I would prefer to keep the current default of 1 and mark the parameter importance as "high", indicating that one most likely wants to change the default when going to production. Default values must not be "production ready" settings IMHO (cf. state.dir). Default values should give the best "out-of-the-box" experience when getting started with your first "word count" -- i.e., a local single-broker setting.

Contributor Author:

OK, I cannot keep the default at 1. This is what led to several bugs. It's not great to expect users to set a parameter that Streams should be maintaining correctly.

Contributor:

@enothereska, as you know, we have changed the behaviour for the offsets topic so that we default to the safe production setting and fail otherwise. That is based on experience, as @guozhangwang said, and seems more relevant than some of the other examples given.

The question then is how to make it easy for development. For the offsets topic, we set the value to 1 in the server.properties that is used in the quickstarts, etc. That won't work here. There are a few possible solutions that will be helpful for this and many other configs:

  1. Have a config where users can define whether the environment is prod or dev and change the defaults based on that.
  2. Provide methods so that a user can get a prod or dev config. For example, StreamsConfigs.production() or StreamConfigs.development().
  3. Add an enum to the constructor of StreamConfigs where users can define if the environment is production or development.

I think I like 3 best. In any case, we don't need to block this PR on the long-term solution. Still, it may be worth figuring out the end state and then a plan on how to get there.
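
A hedged sketch of option 3 (neither the enum nor this constructor exists in StreamsConfig; the names are illustrative):

    // Hypothetical API sketch, not part of this PR.
    public enum Environment { DEVELOPMENT, PRODUCTION }

    public StreamsConfig(final Map<?, ?> props, final Environment env) {
        this(withEnvironmentDefaults(props, env));
    }

    private static Map<Object, Object> withEnvironmentDefaults(final Map<?, ?> props,
                                                               final Environment env) {
        final Map<Object, Object> merged = new HashMap<>(props);
        if (!merged.containsKey(REPLICATION_FACTOR_CONFIG)) {
            // production gets the safe default; development keeps the demo-friendly 1
            merged.put(REPLICATION_FACTOR_CONFIG, env == Environment.PRODUCTION ? 3 : 1);
        }
        return merged;
    }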

Contributor Author:

This will now require a KIP and will be done in a separate PR.

+ }
+ streamsKafkaClient.createTopics(topicsToBeCreated, actualReplicationFactor, windowChangeLogAdditionalRetention, metadata);
return;
} catch (StreamsException ex) {
log.warn("Could not create internal topics: " + ex.getMessage() + " Retry #" + i);
}
try {
Thread.sleep(1000L);
Contributor:

It would be better to pass in an implementation of Time and use time.sleep(1000L) here.
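
A minimal sketch of that suggestion, assuming the constructor were extended to accept Kafka's Time abstraction (it currently is not):

    import org.apache.kafka.common.utils.Time;

    // Hypothetical: inject Time so tests can pass MockTime and avoid real sleeps.
    class RetryBackoff {
        private final Time time;

        RetryBackoff(final Time time) {
            this.time = time;  // Time.SYSTEM in production, MockTime in tests
        }

        void backoff() {
            time.sleep(1000L);  // replaces Thread.sleep(1000L)
        }
    }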

Contributor:

Also, I feel 1 second may be too long in production? In practice brokers should be up and running much earlier than streams apps. For @norwood's cases I still think it's better to fail fast and educate users to retry creating their apps after the broker is fully up than to wait for, say, 5 seconds and hope it will succeed.

Contributor Author:

I'm reducing the time, but passing in Time is a bit of a pain here and other calls also use Thread.sleep. Would prefer to do a cleanup pass later.

} catch (InterruptedException e) {
Member:

nit: add final

// ignore
}
}
throw new StreamsException("Could not create internal topics.");
}
@@ -400,17 +400,21 @@ private RuntimeException unAssignChangeLogPartitions() {

@SuppressWarnings("ThrowableNotThrown")
private void shutdownTasksAndState() {
log.debug("{} shutdownTasksAndState: shutting down all active tasks {} and standby tasks {}", logPrefix,
activeTasks.keySet(), standbyTasks.keySet());
log.debug("{} shutdownTasksAndState: shutting down all active tasks {} " +
"and standby tasks {} and suspended tasks {} and suspended standby tasks {}", logPrefix,
Contributor:

nit: "active tasks {}, standby tasks {}, suspended tasks {}, and suspended standby tasks {}"

+     activeTasks.keySet(), standbyTasks.keySet(),
+     suspendedTasks.keySet(), suspendedStandbyTasks.keySet());

final AtomicReference<RuntimeException> firstException = new AtomicReference<>(null);
// Close all processors in topology order
firstException.compareAndSet(null, closeAllTasks());
+ firstException.compareAndSet(null, closeAllSuspendedTasks());
Member:

Oops. We really missed closing suspended tasks. Really bad :( Great catch, Eno!

Contributor Author:

Yeah this was fun :)

Member:

We really need a bug-fix release for this! cc @guozhangwang

// flush state
firstException.compareAndSet(null, flushAllState());
// Close all task state managers. Don't need to set exception as all
// state would have been flushed above
closeAllStateManagers(firstException.get() == null);
+ closeAllSuspendedStateManagers(firstException.get() == null);
// only commit under clean exit
if (cleanRun && firstException.get() == null) {
firstException.set(commitOffsets());
@@ -475,6 +479,29 @@ private RuntimeException performOnAllTasks(final AbstractTaskAction action,
return firstException;
}

private RuntimeException performOnAllSuspendedTasks(final AbstractTaskAction action,
Contributor:

These two functions are very similar: could we collapse them into one function, performOnTasks, and pass in a List&lt;AbstractTask&gt; as an additional parameter? (See the sketch after this method.)

final String exceptionMessage) {
RuntimeException firstException = null;
final List<AbstractTask> allTasks = new ArrayList<AbstractTask>(suspendedTasks.values());
allTasks.addAll(suspendedStandbyTasks.values());
for (final AbstractTask task : allTasks) {
try {
action.apply(task);
} catch (RuntimeException t) {
log.error("{} Failed while executing {} {} due to {}: ",
StreamThread.this.logPrefix,
task.getClass().getSimpleName(),
task.id(),
exceptionMessage,
t);
if (firstException == null) {
firstException = t;
}
}
}
return firstException;
}
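
A hedged sketch of the collapse suggested in the comment above (an illustration, not code from this PR):

    // Hypothetical refactoring: one helper that takes the tasks to act on,
    // so performOnAllTasks and performOnAllSuspendedTasks become thin wrappers.
    private RuntimeException performOnTasks(final List<AbstractTask> tasks,
                                            final AbstractTaskAction action,
                                            final String exceptionMessage) {
        RuntimeException firstException = null;
        for (final AbstractTask task : tasks) {
            try {
                action.apply(task);
            } catch (final RuntimeException t) {
                log.error("{} Failed while executing {} {} due to {}: ",
                    StreamThread.this.logPrefix, task.getClass().getSimpleName(),
                    task.id(), exceptionMessage, t);
                if (firstException == null) {
                    firstException = t;
                }
            }
        }
        return firstException;
    }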

private Throwable closeAllStateManagers(final boolean writeCheckpoint) {
return performOnAllTasks(new AbstractTaskAction() {
@Override
@@ -485,6 +512,16 @@ public void apply(final AbstractTask task) {
}, "close state manager");
}

private Throwable closeAllSuspendedStateManagers(final boolean writeCheckpoint) {
return performOnAllSuspendedTasks(new AbstractTaskAction() {
@Override
public void apply(final AbstractTask task) {
log.info("{} Closing the state manager of task {}", StreamThread.this.logPrefix, task.id());
task.closeStateManager(writeCheckpoint);
}
}, "close state manager");
}

private RuntimeException commitOffsets() {
// Exceptions should not prevent this call from going through all shutdown steps
return performOnAllTasks(new AbstractTaskAction() {
@@ -575,6 +612,10 @@ private void runLoop() {

for (TopicPartition partition : records.partitions()) {
StreamTask task = activeTasksByPartition.get(partition);
+ if (task == null) {
+     log.warn("No active tasks for partition " + partition);
+     continue;
+ }
numAddedRecords += task.addRecords(partition, records.records(partition));
}
streamsMetrics.skippedRecordsSensor.record(records.count() - numAddedRecords, timerStartedMs);
@@ -1020,6 +1061,17 @@ public void apply(final AbstractTask task) {
}, "close");
}

private RuntimeException closeAllSuspendedTasks() {
Contributor:

Ditto above.

return performOnAllSuspendedTasks(new AbstractTaskAction() {
@Override
public void apply(final AbstractTask task) {
log.info("{} Closing task {}", StreamThread.this.logPrefix, task.id());
task.close();
streamsMetrics.tasksClosedSensor.record();
}
}, "close");
}

private RuntimeException closeAllTasksTopologies() {
return performOnAllTasks(new AbstractTaskAction() {
@Override
@@ -203,7 +203,11 @@ private String ensureOneNodeIsReady(final List<Node> nodes) {
break;
}
}
- kafkaClient.poll(streamsConfig.getLong(StreamsConfig.POLL_MS_CONFIG), Time.SYSTEM.milliseconds());
+ try {
+     kafkaClient.poll(streamsConfig.getLong(StreamsConfig.POLL_MS_CONFIG), Time.SYSTEM.milliseconds());
+ } catch (Exception e) {
Member:

nit: add final

Contributor Author:

@mjsax I didn't get this, what should be final?

Member:

-> } catch (final Exception e) {

throw new StreamsException("Could not poll.");
Contributor:

new StreamsException("Could not poll.", e)

Contributor:

Why do we want to wrap even an RTE as a StreamsException here?

Contributor Author:

This class should hide all underlying network exceptions and wrap them in a StreamsException, IMO. This is consistent with other examples in this class. Otherwise the upper layers would need to know all the details of the underlying classes.

+ }
}
if (brokerId == null) {
throw new StreamsException("Could not find any available broker.");
@@ -268,7 +272,12 @@ public MetadataResponse fetchMetadata() {
new MetadataRequest.Builder(null),
Time.SYSTEM.milliseconds(),
true);
- final ClientResponse clientResponse = sendRequest(clientRequest);
+ ClientResponse clientResponse;
Member:

nit: add final

+ try {
+     clientResponse = sendRequest(clientRequest);
+ } catch (Exception e) {
Member:

nit: add final

Contributor:

Same here.

Contributor Author:

What exactly? I don't get this.

throw new StreamsException("Failed to send request");
Contributor:

Apart from a StreamsException, I think the only other exception poll is going to throw is IllegalStateException -- should we just handle this in sendRequest and leave this as it was? Even if there are more exceptions, I think it would be better to handle them in sendRequest and throw a StreamsException from there.
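
A sketch of that alternative -- wrap once inside sendRequest so call sites stay unchanged (doSendRequest is a hypothetical name for the current body):

    // Hypothetical: centralize the wrapping so fetchMetadata() and friends
    // need no try/catch of their own.
    private ClientResponse sendRequest(final ClientRequest clientRequest) {
        try {
            return doSendRequest(clientRequest);  // the existing send logic
        } catch (final RuntimeException e) {
            throw new StreamsException("Failed to send request.", e);
        }
    }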

+ }
if (!clientResponse.hasResponse()) {
throw new StreamsException("Empty response for client request.");
}
@@ -39,6 +39,7 @@ public class SmokeTestClient extends SmokeTestUtil {
private final File stateDir;
private KafkaStreams streams;
private Thread thread;
+ private boolean uncaughtException = false;

public SmokeTestClient(File stateDir, String kafka) {
super();
@@ -51,10 +52,19 @@ public void start() {
streams.setUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
@Override
public void uncaughtException(Thread t, Throwable e) {
System.out.println("SMOKE-TEST-CLIENT-EXCEPTION");
+ uncaughtException = true;
e.printStackTrace();
}
});

+ Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
+     @Override
+     public void run() {
+         close();
+     }
+ }));

thread = new Thread() {
public void run() {
streams.start();
@@ -64,10 +74,16 @@ public void run() {
}

public void close() {
- streams.close();
+ streams.close(5, TimeUnit.SECONDS);
+ // do not remove these printouts since they are needed for health scripts
+ if (!uncaughtException) {
+     System.out.println("SMOKE-TEST-CLIENT-CLOSED");
+ }
try {
thread.join();
} catch (Exception ex) {
+     // do not remove these printouts since they are needed for health scripts
+     System.out.println("SMOKE-TEST-CLIENT-EXCEPTION");
// ignore
}
}
@@ -77,19 +93,17 @@ private static KafkaStreams createKafkaStreams(File stateDir, String kafka) {
props.put(StreamsConfig.APPLICATION_ID_CONFIG, "SmokeTest");
props.put(StreamsConfig.STATE_DIR_CONFIG, stateDir.toString());
props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, kafka);
// TODO: set number of threads back to 3 once
Contributor:

I guess you can remove this now?

// https://issues.apache.org/jira/browse/KAFKA-3758 is solved
props.put(StreamsConfig.NUM_STREAM_THREADS_CONFIG, 3);
props.put(StreamsConfig.NUM_STANDBY_REPLICAS_CONFIG, 2);
props.put(StreamsConfig.BUFFERED_RECORDS_PER_PARTITION_CONFIG, 100);
props.put(StreamsConfig.REPLICATION_FACTOR_CONFIG, 2);
props.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 1000);
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

KStreamBuilder builder = new KStreamBuilder();

KStream<String, Integer> source = builder.stream(stringSerde, intSerde, "data");

source.to(stringSerde, intSerde, "echo");

KStream<String, Integer> data = source.filter(new Predicate<String, Integer>() {
@Override
public boolean test(String key, Integer value) {
Expand Down
Expand Up @@ -136,6 +136,12 @@ public static Map<String, Set<Integer>> generate(String kafka, final int numKeys
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka);
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
+ // the next 4 config values make sure that all records are produced with no loss and
+ // no duplicates
+ props.put(ProducerConfig.RETRIES_CONFIG, Integer.MAX_VALUE);
Contributor Author:

@ijuma do these parameters make sense for a config that should not lose data? We are OK with duplicates. Do I need to do anything with unclean.leader.election?

Contributor:

You can remove this now as you've set it as the default in StreamsConfig

Contributor Author:

Not quite, since in StreamsConfig it is the internal streams producer. Here it's another producer.

Member:

Rename props to producerProps ;)

+ props.put(ProducerConfig.ACKS_CONFIG, "all");
Contributor:

Same with this one

+ props.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, 1);
+ props.put(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG, 60 * 1000);

KafkaProducer<byte[], byte[]> producer = new KafkaProducer<>(props);
