Unicode bug fix cleanup #4

Merged 2 commits on Oct 17, 2014
241 changes: 123 additions & 118 deletions camus-etl-kafka/pom.xml
@@ -1,123 +1,128 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

Review comment:

It's hard to tell what has actually changed in this pom file because of the re-formatting. Would it be possible to keep the original formatting so that the diff stays readable?

Author reply:

I tried. However, most of the whitespace in the old file is tabs, and I only replaced the tabs with spaces. Unfortunately, the diff doesn't present the changes the way we'd want.

<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.linkedin.camus</groupId>
<artifactId>camus-parent</artifactId>
<version>0.1.0-SNAPSHOT</version>
</parent>
<parent>
<groupId>com.linkedin.camus</groupId>
<artifactId>camus-parent</artifactId>
<version>0.1.0-SNAPSHOT</version>
</parent>

<artifactId>camus-etl-kafka</artifactId>
<name>Camus ETL to move data from Kafka to Hadoop.</name>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.linkedin.camus</groupId>
<artifactId>camus-api</artifactId>
</dependency>
<dependency>
<groupId>com.linkedin.camus</groupId>
<artifactId>camus-schema-registry</artifactId>
</dependency>
<dependency>
<groupId>com.linkedin.camus</groupId>
<artifactId>camus-kafka-coders</artifactId>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-mapred</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
</dependency>
<dependency>
<groupId>com.github.sgroschupf</groupId>
<artifactId>zkclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>jline</groupId>
<artifactId>jline</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.apache.avro</groupId>
<artifactId>avro-maven-plugin</artifactId>
<version>1.7.0</version>
<executions>
<execution> <id>schemas</id> <phase>generate-sources</phase> <goals>
<goal>schema</goal> <goal>protocol</goal> <goal>idl-protocol</goal> </goals>
<configuration> <sourceDirectory>${project.basedir}/src/main/avro/</sourceDirectory>
<outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
<lifecycleMappingMetadata> <action> <ignore /> </action> </lifecycleMappingMetadata></configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<mainClass></mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<artifactId>camus-etl-kafka</artifactId>
<name>Camus ETL to move data from Kafka to Hadoop.</name>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.linkedin.camus</groupId>
<artifactId>camus-api</artifactId>
</dependency>
<dependency>
<groupId>com.linkedin.camus</groupId>
<artifactId>camus-schema-registry</artifactId>
</dependency>
<dependency>
<groupId>com.linkedin.camus</groupId>
<artifactId>camus-kafka-coders</artifactId>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-mapred</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
</dependency>
<dependency>
<groupId>com.github.sgroschupf</groupId>
<artifactId>zkclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>jline</groupId>
<artifactId>jline</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>1.9.5</version>
<scope>test</scope>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.apache.avro</groupId>
<artifactId>avro-maven-plugin</artifactId>
<version>1.7.0</version>
<executions>
<execution> <id>schemas</id> <phase>generate-sources</phase> <goals>
<goal>schema</goal> <goal>protocol</goal> <goal>idl-protocol</goal> </goals>
<configuration> <sourceDirectory>${project.basedir}/src/main/avro/</sourceDirectory>
<outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
<lifecycleMappingMetadata> <action> <ignore /> </action> </lifecycleMappingMetadata></configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<mainClass></mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
camus-etl-kafka/src/main/java/com/linkedin/camus/etl/kafka/common/StringRecordWriter.java (new file)
@@ -0,0 +1,37 @@
package com.linkedin.camus.etl.kafka.common;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;

import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.linkedin.camus.coders.CamusWrapper;
import com.linkedin.camus.etl.IEtlKey;

@SuppressWarnings("rawtypes")
public class StringRecordWriter extends RecordWriter<IEtlKey, CamusWrapper> {
private final OutputStream output;
private final String recordDelimiter;

public StringRecordWriter(OutputStream compressedOutput,
String recordDelimiter) {
this.output = compressedOutput;
this.recordDelimiter = recordDelimiter;
}

@Override
public void write(IEtlKey ignore, CamusWrapper data) throws IOException {
String record = (String) data.getRecord() + recordDelimiter;
// Need to specify Charset because the default might not be UTF-8.
// Bug fix for https://jira.airbnb.com:8443/browse/PRODUCT-5551.
output.write(record.getBytes(Charset.forName("UTF-8")));
}

@Override
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
output.close();
}
}
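
The PR also adds mockito-core as a test-scoped dependency in the pom above. A minimal unit-test sketch along the following lines (a hypothetical StringRecordWriterTest, not part of this diff) could verify that the writer always emits UTF-8 bytes regardless of the JVM default charset:

package com.linkedin.camus.etl.kafka.common;

import static org.junit.Assert.assertArrayEquals;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.io.ByteArrayOutputStream;
import java.nio.charset.Charset;

import org.junit.Test;

import com.linkedin.camus.coders.CamusWrapper;

public class StringRecordWriterTest {

    @SuppressWarnings("rawtypes")
    @Test
    public void writesUtf8BytesRegardlessOfDefaultCharset() throws Exception {
        // Collect whatever the writer emits in memory instead of on HDFS.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        StringRecordWriter writer = new StringRecordWriter(out, "\n");

        // Mock a wrapper whose record contains a non-ASCII character.
        CamusWrapper data = mock(CamusWrapper.class);
        when(data.getRecord()).thenReturn("caf\u00e9");

        writer.write(null, data); // the key argument is ignored by the writer
        writer.close(null);

        // The output must be the UTF-8 encoding of record plus delimiter,
        // even when the JVM default charset is e.g. ISO-8859-1.
        assertArrayEquals("caf\u00e9\n".getBytes(Charset.forName("UTF-8")),
                out.toByteArray());
    }
}

Note that closing the writer closes the underlying output stream, so in the real code path this also closes the CompressionOutputStream handed in by StringRecordWriterProvider.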
camus-etl-kafka/src/main/java/com/linkedin/camus/etl/kafka/common/StringRecordWriterProvider.java
@@ -1,27 +1,29 @@
package com.linkedin.camus.etl.kafka.common;

import com.linkedin.camus.coders.CamusWrapper;
import com.linkedin.camus.etl.IEtlKey;
import com.linkedin.camus.etl.RecordWriterProvider;
import com.linkedin.camus.etl.kafka.mapred.EtlMultiOutputFormat;
import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;

import org.apache.log4j.Logger;
import com.linkedin.camus.coders.CamusWrapper;
import com.linkedin.camus.etl.IEtlKey;
import com.linkedin.camus.etl.RecordWriterProvider;
import com.linkedin.camus.etl.kafka.mapred.EtlMultiOutputFormat;


/**
* Provides a RecordWriter that uses FSDataOutputStream to write
* a String record as bytes to HDFS without any reformatting or compression.
*/
public class StringRecordWriterProvider implements RecordWriterProvider {


public static final String ETL_OUTPUT_RECORD_DELIMITER = "etl.output.record.delimiter";
public static final String DEFAULT_RECORD_DELIMITER = "";

@@ -35,6 +37,7 @@ public String getFilenameExtension() {
return ".gz";
}

@SuppressWarnings("rawtypes")
@Override
public RecordWriter<IEtlKey, CamusWrapper> getDataRecordWriter(
TaskAttemptContext context,
@@ -65,19 +68,6 @@ context, fileName, getFilenameExtension()
CompressionCodec codec = codecFactory.getCodec(path);
final CompressionOutputStream compressedOutput = codec.createOutputStream(writer);

// Return a new anonymous RecordWriter that uses the
// FSDataOutputStream writer to write bytes straight into path.
return new RecordWriter<IEtlKey, CamusWrapper>() {
@Override
public void write(IEtlKey ignore, CamusWrapper data) throws IOException {
String record = (String)data.getRecord() + recordDelimiter;
compressedOutput.write(record.getBytes());
}

@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
compressedOutput.close();
}
};
return new StringRecordWriter(compressedOutput, recordDelimiter);
}
}
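
For context, the anonymous writer removed above called record.getBytes() with no charset argument, so the bytes written depended on the JVM's platform default encoding. A standalone sketch (illustrative only, not part of this PR; the CharsetDemo class is hypothetical) of why the explicit charset matters:

import java.nio.charset.Charset;
import java.util.Arrays;

public class CharsetDemo {
    public static void main(String[] args) {
        String record = "\u00fcber"; // "über"

        byte[] platformDefault = record.getBytes();                      // old behaviour
        byte[] explicitUtf8 = record.getBytes(Charset.forName("UTF-8")); // new behaviour

        // With the JVM started as e.g. -Dfile.encoding=ISO-8859-1 the two differ:
        //   platformDefault -> [-4, 98, 101, 114]        (0xFC 'b' 'e' 'r')
        //   explicitUtf8    -> [-61, -68, 98, 101, 114]  (0xC3 0xBC 'b' 'e' 'r')
        System.out.println(Arrays.toString(platformDefault));
        System.out.println(Arrays.toString(explicitUtf8));
    }
}

Downstream consumers that assume UTF-8 would mis-read the first form, which is the class of bug this PR fixes.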