Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

AVRO-1202. Java & Python: Add Getting Started guides. Contributed by …

…Skye Wanderman-Milne.

git-svn-id: https://svn.apache.org/repos/asf/avro/trunk@1410619 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
commit 22d55ddf3e6ff8021da1743c8bb989111391b32a 1 parent c8fe3e0
Doug Cutting cutting authored
3  CHANGES.txt
View
@@ -18,6 +18,9 @@ Trunk (not yet released)
AVRO-1188. Java: Permit external schema imports for schemas in
Maven plugin. (Sharmarke Aden via tomwhite)
+ AVRO-1202. Java & Python: Add "Getting Started" guides.
+ (Skye Wanderman-Milne via cutting)
+
IMPROVEMENTS
AVRO-1169. Java: Reduce memory footprint of resolver.
15 doc/examples/example.py
View
@@ -0,0 +1,15 @@
+import avro.schema
+from avro.datafile import DataFileReader, DataFileWriter
+from avro.io import DatumReader, DatumWriter
+
+schema = avro.schema.parse(open("user.avsc").read())
+
+writer = DataFileWriter(open("/tmp/users.avro", "w"), DatumWriter(), schema)
+writer.append({"name": "Alyssa", "favorite_number": 256, "WTF": 2})
+writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
+writer.close()
+
+reader = DataFileReader(open("/tmp/users.avro", "r"), DatumReader())
+for user in reader:
+ print user
+reader.close()
52 doc/examples/java-example/pom.xml
View
@@ -0,0 +1,52 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>example</groupId>
+ <artifactId>java-example</artifactId>
+ <packaging>jar</packaging>
+ <version>1.0-SNAPSHOT</version>
+ <name>java-example</name>
+ <url>http://maven.apache.org</url>
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>3.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro</artifactId>
+ <version>1.7.2</version>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro-maven-plugin</artifactId>
+ <version>1.7.2</version>
+ <executions>
+ <execution>
+ <phase>generate-sources</phase>
+ <goals>
+ <goal>schema</goal>
+ </goals>
+ <configuration>
+ <sourceDirectory>${project.basedir}/../</sourceDirectory>
+ <outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
53 doc/examples/java-example/src/main/java/example/GenericMain.java
View
@@ -0,0 +1,53 @@
+package example;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Parser;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.io.DatumWriter;
+
+public class GenericMain {
+ public static void main(String[] args) throws IOException {
+ Schema schema = new Parser().parse(new File("/home/skye/code/cloudera/avro/doc/examples/user.avsc"));
+
+ GenericRecord user1 = new GenericData.Record(schema);
+ user1.put("name", "Alyssa");
+ user1.put("favorite_number", 256);
+ // Leave favorite color null
+
+ GenericRecord user2 = new GenericData.Record(schema);
+ user2.put("name", "Ben");
+ user2.put("favorite_number", 7);
+ user2.put("favorite_color", "red");
+
+ // Serialize user1 and user2 to disk
+ File file = new File("users.avro");
+ DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
+ DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
+ dataFileWriter.create(schema, file);
+ dataFileWriter.append(user1);
+ dataFileWriter.append(user2);
+ dataFileWriter.close();
+
+ // Deserialize users from disk
+ DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
+ DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(file, datumReader);
+ GenericRecord user = null;
+ while (dataFileReader.hasNext()) {
+ // Reuse user object by passing it to next(). This saves us from
+ // allocating and garbage collecting many objects for files with
+ // many items.
+ user = dataFileReader.next(user);
+ System.out.println(user);
+ }
+
+ }
+}
55 doc/examples/java-example/src/main/java/example/SpecificMain.java
View
@@ -0,0 +1,55 @@
+package example;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.avro.specific.SpecificDatumWriter;
+
+import example.avro.User;
+
+public class SpecificMain {
+ public static void main(String[] args) throws IOException {
+ User user1 = new User();
+ user1.setName("Alyssa");
+ user1.setFavoriteNumber(256);
+ // Leave favorite color null
+
+ // Alternate constructor
+ User user2 = new User("Ben", 7, "red");
+
+ // Construct via builder
+ User user3 = User.newBuilder()
+ .setName("Charlie")
+ .setFavoriteColor("blue")
+ .setFavoriteNumber(null)
+ .build();
+
+ // Serialize user1 and user2 to disk
+ File file = new File("users.avro");
+ DatumWriter<User> userDatumWriter = new SpecificDatumWriter<User>(User.class);
+ DataFileWriter<User> dataFileWriter = new DataFileWriter<User>(userDatumWriter);
+ dataFileWriter.create(user1.getSchema(), file);
+ dataFileWriter.append(user1);
+ dataFileWriter.append(user2);
+ dataFileWriter.append(user3);
+ dataFileWriter.close();
+
+ // Deserialize Users from disk
+ DatumReader<User> userDatumReader = new SpecificDatumReader<User>(User.class);
+ DataFileReader<User> dataFileReader = new DataFileReader<User>(file, userDatumReader);
+ User user = null;
+ while (dataFileReader.hasNext()) {
+ // Reuse user object by passing it to next(). This saves us from
+ // allocating and garbage collecting many objects for files with
+ // many items.
+ user = dataFileReader.next(user);
+ System.out.println(user);
+ }
+
+ }
+}
9 doc/examples/user.avsc
View
@@ -0,0 +1,9 @@
+{"namespace": "example.avro",
+ "type": "record",
+ "name": "User",
+ "fields": [
+ {"name": "name", "type": "string"},
+ {"name": "favorite_number", "type": ["int", "null"]},
+ {"name": "favorite_color", "type": ["string", "null"]}
+ ]
+}
477 doc/src/content/xdocs/gettingstartedjava.xml
View
@@ -0,0 +1,477 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
+ "http://forrest.apache.org/dtd/document-v20.dtd" [
+ <!ENTITY % avro-entities PUBLIC "-//Apache//ENTITIES Avro//EN"
+ "../../../../build/avro.ent">
+ %avro-entities;
+]>
+<document>
+ <header>
+ <title>Apache Avro&#153; &AvroVersion; Getting Started (Java)</title>
+ </header>
+ <body>
+ <p>
+ This is a short guide for getting started with Apache Avro&#153; using
+ Java. This guide only covers using Avro for data serialization; see
+ Patrick Hunt's <a href="https://github.com/phunt/avro-rpc-quickstart">Avro
+ RPC Quick Start</a> for a good introduction to using Avro for RPC.
+ </p>
+ <section id="download_install">
+ <title>Download</title>
+ <p>
+ Avro implementations for C, C++, C#, Java, PHP, Python, and Ruby can be
+ downloaded from the <a
+ href="http://avro.apache.org/releases.html">Apache Avro&#153;
+ Releases</a> page. This guide uses Avro &AvroVersion;, the latest
+ version at the time of writing. For the examples in this guide,
+ download <em>avro-&AvroVersion;.jar</em> and
+ <em>avro-tools-&AvroVersion;.jar</em>. The Avro Java implementation
+ also depends on the <a href="http://jackson.codehaus.org/">Jackson</a>
+ JSON library. From the Jackson <a
+ href="http://wiki.fasterxml.com/JacksonDownload">download page</a>,
+ download the core-asl and mapper-asl jars. Add
+ <em>avro-&AvroVersion;.jar</em> and the Jackson jars to your project's
+ classpath (avro-tools will be used for code generation).
+ </p>
+ <p>
+ Alternatively, if you are using Maven, add the following dependency to
+ your POM:
+ </p>
+ <source>
+&#60;dependency&#62;
+ &#60;groupId&#62;org.apache.avro&#60;/groupId&#62;
+ &#60;artifactId&#62;avro&#60;/artifactId&#62;
+ &#60;version&#62;&AvroVersion;&#60;/version&#62;
+&#60;/dependency&#62;
+ </source>
+ <p>
+ As well as the Avro Maven plugin (for performing code generation):
+ </p>
+ <source>
+&#60;plugin&#62;
+ &#60;groupId&#62;org.apache.avro&#60;/groupId&#62;
+ &#60;artifactId&#62;avro-maven-plugin&#60;/artifactId&#62;
+ &#60;version&#62;&AvroVersion;&#60;/version&#62;
+ &#60;executions&#62;
+ &#60;execution&#62;
+ &#60;phase&#62;generate-sources&#60;/phase&#62;
+ &#60;goals&#62;
+ &#60;goal&#62;schema&#60;/goal&#62;
+ &#60;/goals&#62;
+ &#60;configuration&#62;
+ &#60;sourceDirectory&#62;${project.basedir}/src/main/avro/&#60;/sourceDirectory&#62;
+ &#60;outputDirectory&#62;${project.basedir}/src/main/java/&#60;/outputDirectory&#62;
+ &#60;/configuration&#62;
+ &#60;/execution&#62;
+ &#60;/executions&#62;
+&#60;/plugin&#62;
+&#60;plugin&#62;
+ &#60;groupId&#62;org.apache.maven.plugins&#60;/groupId&#62;
+ &#60;artifactId&#62;maven-compiler-plugin&#60;/artifactId&#62;
+ &#60;configuration&#62;
+ &#60;source&#62;1.6&#60;/source&#62;
+ &#60;target&#62;1.6&#60;/target&#62;
+ &#60;/configuration&#62;
+&#60;/plugin&#62;
+ </source>
+ <p>
+ You may also build the required Avro jars from source. Building Avro is
+ beyond the scope of this guide; see the <a
+ href="https://cwiki.apache.org/AVRO/build-documentation.html">Build
+ Documentation</a> page in the wiki for more information.
+ </p>
+ </section>
+
+ <section>
+ <title>Defining a schema</title>
+ <p>
+ Avro schemas are defined using JSON. Schemas are composed of <a
+ href="spec.html#schema_primitive">primitive types</a>
+ (<code>null</code>, <code>boolean</code>, <code>int</code>,
+ <code>long</code>, <code>float</code>, <code>double</code>,
+ <code>bytes</code>, and <code>string</code>) and <a
+ href="spec.html#schema_complex">complex types</a> (<code>record</code>,
+ <code>enum</code>, <code>array</code>, <code>map</code>,
+ <code>union</code>, and <code>fixed</code>). You can learn more about
+ Avro schemas and types from the specification, but for now let's start
+ with a simple schema example, <em>user.avsc</em>:
+ </p>
+ <source>
+{"namespace": "example.avro",
+ "type": "record",
+ "name": "User",
+ "fields": [
+ {"name": "name", "type": "string"},
+ {"name": "favorite_number", "type": ["int", "null"]},
+ {"name": "favorite_color", "type": ["string", "null"]}
+ ]
+}
+ </source>
+ <p>
+ This schema defines a record representing a hypothetical user. (Note
+ that a schema file can only contain a single schema definition.) At
+ minimum, a record definition must include its type (<code>"type":
+ "record"</code>), a name (<code>"name": "User"</code>), and fields, in
+ this case <code>name</code>, <code>favorite_number</code>, and
+ <code>favorite_color</code>. We also define a namespace
+ (<code>"namespace": "example.avro"</code>), which together with the name
+ attribute defines the "full name" of the schema
+ (<code>example.avro.User</code> in this case).
+
+ </p>
+ <p>
+ Fields are defined via an array of objects, each of which defines a name
+ and type (other attributes are optional, see the <a
+ href="spec.html#schema_record">record specification</a> for more
+ details). The type attribute of a field is another schema object, which
+ can be either a primitive or complex type. For example, the
+ <code>name</code> field of our User schema is the primitive type
+ <code>string</code>, whereas the <code>favorite_number</code> and
+ <code>favorite_color</code> fields are both <code>union</code>s,
+ represented by JSON arrays. <code>union</code>s are a complex type that
+ can be any of the types listed in the array; e.g.,
+ <code>favorite_number</code> can either be an <code>int</code> or
+ <code>null</code>, essentially making it an optional field.
+ </p>
+ </section>
+
+ <section>
+ <title>Serializing and deserializing with code generation</title>
+ <section>
+ <title>Compiling the schema</title>
+ <p>
+ Code generation allows us to automatically create classes based on our
+ previously-defined schema. Once we have defined the relevant classes,
+ there is no need to use the schema directly in our programs. We use the
+ avro-tools jar to generate code as follows:
+ </p>
+ <source>
+java -jar /path/to/avro-tools-&AvroVersion;.jar compile schema &#60;schema file&#62; &#60;destination&#62;
+ </source>
+ <p>
+ This will generate the appropriate source files in a package based on
+ the schema's namespace in the provided destination folder. For
+ instance, to generate a <code>User</code> class in package
+ <code>example.avro</code> from the schema defined above, run
+ </p>
+ <source>
+java -jar /path/to/avro-tools-&AvroVersion;.jar compile schema user.avsc .
+ </source>
+ <p>
+ Note that if you using the Avro Maven plugin, there is no need to
+ manually invoke the schema compiler; the plugin automatically
+ performs code generation on any .avsc files present in the configured
+ source directory.
+ </p>
+ </section>
+ <section>
+ <title>Creating Users</title>
+ <p>
+ Now that we've completed the code generation, let's create some
+ <code>User</code>s, serialize them to a data file on disk, and then
+ read back the file and deserialize the <code>User</code> objects.
+ </p>
+ <p>
+ First let's create some <code>User</code>s and set their fields.
+ </p>
+ <source>
+User user1 = new User();
+user1.setName("Alyssa");
+user1.setFavoriteNumber(256);
+// Leave favorite color null
+
+// Alternate constructor
+User user2 = new User("Ben", 7, "red");
+
+// Construct via builder
+User user3 = User.newBuilder()
+ .setName("Charlie")
+ .setFavoriteColor("blue")
+ .setFavoriteNumber(null)
+ .build();
+ </source>
+ <p>
+ As shown in this example, Avro objects can be created either by
+ invoking a constructor directly or by using a builder. Unlike
+ constructors, builders will automatically set any default values
+ specified in the schema. Additionally, builders validate the data as
+ it set, whereas objects constructed directly will not cause an error
+ until the object is serialized. However, using constructors directly
+ generally offers better performance, as builders create a copy of the
+ datastructure before it is written.
+ </p>
+ <p>
+ Note that we do not set <code>user1</code>'s favorite color. Since
+ that record is of type <code>["string", "null"]</code>, we can either
+ set it to a <code>string</code> or leave it <code>null</code>; it is
+ essentially optional. Similarly, we set <code>user3</code>'s favorite
+ number to null (using a builder requires setting all fields, even if
+ they are null).
+ </p>
+ </section>
+ <section>
+ <title>Serializing</title>
+ <p>
+ Now let's serialize our <code>User</code>s to disk.
+ </p>
+ <source>
+// Serialize user1 and user2 to disk
+File file = new File("users.avro");
+DatumWriter&#60;User&#62; userDatumWriter = new SpecificDatumWriter&#60;User&#62;(User.class);
+DataFileWriter&#60;User&#62; dataFileWriter = new DataFileWriter&#60;User&#62;(userDatumWriter);
+dataFileWriter.create(user1.getSchema(), new File("users.avro"));
+dataFileWriter.append(user1);
+dataFileWriter.append(user2);
+dataFileWriter.append(user3);
+dataFileWriter.close();
+ </source>
+ <p>
+ We create a <code>DatumWriter</code>, which converts Java objects into
+ an in-memory serialized format. The <code>SpecificDatumWriter</code>
+ class is used with generated classes and extracts the schema from the
+ specified generated type.
+ </p>
+ <p>
+ Next we create a <code>DataFileWriter</code>, which writes the
+ serialized records, as well as the schema, to the file specified in the
+ <code>dataFileWriter.create</code> call. We write our users to the file
+ via calls to the <code>dataFileWriter.append</code> method. When we are
+ done writing, we close the data file.
+ </p>
+ </section>
+ <section>
+ <title>Deserializing</title>
+ <p>
+ Finally, let's deserialize the data file we just created.
+ </p>
+ <source>
+// Deserialize Users from disk
+DatumReader&#60;User&#62; userDatumReader = new SpecificDatumReader&#60;User&#62;(User.class);
+DataFileReader&#60;User&#62; dataFileReader = new DataFileReader&#60;User&#62;(file, userDatumReader);
+User user = null;
+while (dataFileReader.hasNext()) {
+// Reuse user object by passing it to next(). This saves us from
+// allocating and garbage collecting many objects for files with
+// many items.
+user = dataFileReader.next(user);
+System.out.println(user);
+}
+ </source>
+ <p>
+ This snippet will output:
+ </p>
+ <source>
+{"name": "Alyssa", "favorite_number": 256, "favorite_color": null}
+{"name": "Ben", "favorite_number": 7, "favorite_color": "red"}
+{"name": "Charlie", "favorite_number": null, "favorite_color": "blue"}
+ </source>
+ <p>
+ Deserializing is very similar to serializing. We create a
+ <code>SpecificDatumReader</code>, analogous to the
+ <code>SpecificDatumWriter</code> we used in serialization, which
+ converts in-memory serialized items into instances of our generated
+ class, in this case <code>User</code>. We pass the
+ <code>DatumReader</code> and the previously created <code>File</code>
+ to a <code>DataFileReader</code>, analogous to the
+ <code>DataFileWriter</code>, which reads the data file on disk.
+ </p>
+ <p>
+ Next we use the <code>DataFileReader</code> to iterate through the
+ serialized <code>User</code>s and print the deserialized object to
+ stdout. Note how we perform the iteration: we create a single
+ <code>User</code> object which we store the current deserialized user
+ in, and pass this record object to every call of
+ <code>dataFileReader.next</code>. This is a performance optimization
+ that allows the <code>DataFileReader</code> to reuse the same
+ <code>User</code> object rather than allocating a new
+ <code>User</code> for every iteration, which can be very expensive in
+ terms of object allocation and garbage collection if we deserialize a
+ large data file. While this technique is the standard way to iterate
+ through a data file, it's also possible to use <code>for (User user :
+ dataFileReader)</code> if performance is not a concern.
+ </p>
+ </section>
+ <section>
+ <title>Compiling and running the example code</title>
+ <p>
+ This example code is included as a Maven project in the
+ <em>examples/java-example</em> directory in the Avro docs. From this
+ directory, execute the following commands to build and run the
+ example:
+ </p>
+ <source>
+$ mvn compile # includes code generation via Avro Maven plugin
+$ mvn -q exec:java -Dexec.mainClass=example.SpecificMain
+ </source>
+ </section>
+ </section>
+
+ <section>
+ <title>Serializing and deserializing without code generation</title>
+ <p>
+ Data in Avro is always stored with its corresponding schema, meaning we
+ can always read a serialized item regardless of whether we know the
+ schema ahead of time. This allows us to perform serialization and
+ deserialization without code generation.
+ </p>
+ <p>
+ Let's go over the same example as in the previous section, but without
+ using code generation: we'll create some users, serialize them to a data
+ file on disk, and then read back the file and deserialize the users
+ objects.
+ </p>
+ <section>
+ <title>Creating users</title>
+ <p>
+ First, we use a <code>Parser</code> to read our schema definition and
+ create a <code>Schema</code> object.
+ </p>
+ <source>
+Schema schema = new Parser().parse(new File("user.avsc"));
+ </source>
+ <p>
+ Using this schema, let's create some users.
+ </p>
+ <source>
+GenericRecord user1 = new GenericData.Record(schema);
+user1.put("name", "Alyssa");
+user1.put("favorite_number", 256);
+// Leave favorite color null
+
+GenericRecord user2 = new GenericData.Record(schema);
+user2.put("name", "Ben");
+user2.put("favorite_number", 7);
+user2.put("favorite_color", "red");
+ </source>
+ <p>
+ Since we're not using code generation, we use
+ <code>GenericRecord</code>s to represent users.
+ <code>GenericRecord</code> uses the schema to verify that we only
+ specify valid fields. If we try to set a non-existent field (e.g.,
+ <code>user1.put("favorite_animal", "cat")</code>), we'll get an
+ <code>AvroRuntimeException</code> when we run the program.
+ </p>
+ <p>
+ Note that we do not set <code>user1</code>'s favorite color. Since
+ that record is of type <code>["string", "null"]</code>, we can either
+ set it to a <code>string</code> or leave it <code>null</code>; it is
+ essentially optional.
+ </p>
+ </section>
+ <section>
+ <title>Serializing</title>
+ <p>
+ Now that we've created our user objects, serializing and deserializing
+ them is almost identical to the example above which uses code
+ generation. The main difference is that we use generic instead of
+ specific readers and writers.
+ </p>
+ <p>
+ First we'll serialize our users to a data file on disk.
+ </p>
+ <source>
+// Serialize user1 and user2 to disk
+File file = new File("users.avro");
+DatumWriter&#60;GenericRecord&#62; datumWriter = new GenericDatumWriter&#60;GenericRecord&#62;(schema);
+DataFileWriter&#60;GenericRecord&#62; dataFileWriter = new DataFileWriter&#60;GenericRecord&#62;(datumWriter);
+dataFileWriter.create(schema, file);
+dataFileWriter.append(user1);
+dataFileWriter.append(user2);
+dataFileWriter.close();
+ </source>
+ <p>
+ We create a <code>DatumWriter</code>, which converts Java objects into
+ an in-memory serialized format. Since we are not using code
+ generation, we create a <code>GenericDatumWriter</code>. It requires
+ the schema both to determine how to write the
+ <code>GenericRecord</code>s and to verify that all non-nullable fields
+ are present.
+ </p>
+ <p>
+ As in the code generation example, we also create a
+ <code>DataFileWriter</code>, which writes the serialized records, as
+ well as the schema, to the file specified in the
+ <code>dataFileWriter.create</code> call. We write our users to the
+ file via calls to the <code>dataFileWriter.append</code> method. When
+ we are done writing, we close the data file.
+ </p>
+ </section>
+ <section>
+ <title>Deserializing</title>
+ <p>
+ Finally, we'll deserialize the data file we just created.
+ </p>
+ <source>
+// Deserialize users from disk
+DatumReader&#60;GenericRecord&#62; datumReader = new GenericDatumReader&#60;GenericRecord&#62;(schema);
+DataFileReader&#60;GenericRecord&#62; dataFileReader = new DataFileReader&#60;GenericRecord&#62;(file, datumReader);
+GenericRecord user = null;
+while (dataFileReader.hasNext()) {
+// Reuse user object by passing it to next(). This saves us from
+// allocating and garbage collecting many objects for files with
+// many items.
+user = dataFileReader.next(user);
+System.out.println(user);
+ </source>
+ <p>This outputs:</p>
+ <source>
+{"name": "Alyssa", "favorite_number": 256, "favorite_color": null}
+{"name": "Ben", "favorite_number": 7, "favorite_color": "red"}
+ </source>
+ <p>
+ Deserializing is very similar to serializing. We create a
+ <code>GenericDatumReader</code>, analogous to the
+ <code>GenericDatumWriter</code> we used in serialization, which
+ converts in-memory serialized items into <code>GenericRecords</code>.
+ We pass the <code>DatumReader</code> and the previously created
+ <code>File</code> to a <code>DataFileReader</code>, analogous to the
+ <code>DataFileWriter</code>, which reads the data file on disk.
+ </p>
+ <p>
+ Next, we use the <code>DataFileReader</code> to iterate through the
+ serialized users and print the deserialized object to stdout. Note
+ how we perform the iteration: we create a single
+ <code>GenericRecord</code> object which we store the current
+ deserialized user in, and pass this record object to every call of
+ <code>dataFileReader.next</code>. This is a performance optimization
+ that allows the <code>DataFileReader</code> to reuse the same record
+ object rather than allocating a new <code>GenericRecord</code> for
+ every iteration, which can be very expensive in terms of object
+ allocation and garbage collection if we deserialize a large data file.
+ While this technique is the standard way to iterate through a data
+ file, it's also possible to use <code>for (GenericRecord user :
+ dataFileReader)</code> if performance is not a concern.
+ </p>
+ </section>
+ <section>
+ <title>Compiling and running the example code</title>
+ <p>
+ This example code is included as a Maven project in the
+ <em>examples/java-example</em> directory in the Avro docs. From this
+ directory, execute the following commands to build and run the
+ example:
+ </p>
+ <source>
+$ mvn compile
+$ mvn -q exec:java -Dexec.mainClass=example.GenericMain
+ </source>
+ </section>
+ </section>
+ </body>
+</document>
221 doc/src/content/xdocs/gettingstartedpython.xml
View
@@ -0,0 +1,221 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
+ "http://forrest.apache.org/dtd/document-v20.dtd" [
+ <!ENTITY % avro-entities PUBLIC "-//Apache//ENTITIES Avro//EN"
+ "../../../../build/avro.ent">
+ %avro-entities;
+]>
+<document>
+ <header>
+ <title>Apache Avro&#153; &AvroVersion; Getting Started (Python)</title>
+ </header>
+ <body>
+ <p>
+ This is a short guide for getting started with Apache Avro&#153; using
+ Python. This guide only covers using Avro for data serialization; see
+ Patrick Hunt's <a href="https://github.com/phunt/avro-rpc-quickstart">Avro
+ RPC Quick Start</a> for a good introduction to using Avro for RPC.
+ </p>
+
+ <section id="download_install">
+ <title>Download</title>
+ <p>
+ Avro implementations for C, C++, C#, Java, PHP, Python, and Ruby can be
+ downloaded from the <a
+ href="http://avro.apache.org/releases.html">Apache Avro&#153;
+ Releases</a> page. This guide uses Avro &AvroVersion;, the latest
+ version at the time of writing. Download and unzip
+ <em>avro-&AvroVersion;.tar.gz</em>, and install via <code>python
+ setup.py</code> (this will probably require root privileges). Ensure
+ that you can <code>import avro</code> from a Python prompt.
+ </p>
+ <source>
+$ tar xvf avro-&AvroVersion;.tar.gz
+$ cd avro-&AvroVersion;
+$ sudo python setup.py install
+$ python
+>>> import avro # should not raise ImportError
+ </source>
+ <p>
+ Alternatively, you may build the Avro Python library from source. From
+ your the root Avro directory, run the commands
+ </p>
+ <source>
+$ cd lang/py/
+$ ant
+$ sudo python setup.py install
+$ python
+>>> import avro # should not raise ImportError
+ </source>
+ </section>
+
+ <section>
+ <title>Defining a schema</title>
+ <p>
+ Avro schemas are defined using JSON. Schemas are composed of <a
+ href="spec.html#schema_primitive">primitive types</a>
+ (<code>null</code>, <code>boolean</code>, <code>int</code>,
+ <code>long</code>, <code>float</code>, <code>double</code>,
+ <code>bytes</code>, and <code>string</code>) and <a
+ href="spec.html#schema_complex">complex types</a> (<code>record</code>,
+ <code>enum</code>, <code>array</code>, <code>map</code>,
+ <code>union</code>, and <code>fixed</code>). You can learn more about
+ Avro schemas and types from the specification, but for now let's start
+ with a simple schema example, <em>user.avsc</em>:
+ </p>
+ <source>
+{"namespace": "example.avro",
+ "type": "record",
+ "name": "User",
+ "fields": [
+ {"name": "name", "type": "string"},
+ {"name": "favorite_number", "type": ["int", "null"]},
+ {"name": "favorite_color", "type": ["string", "null"]}
+ ]
+}
+ </source>
+ <p>
+ This schema defines a record representing a hypothetical user. (Note
+ that a schema file can only contain a single schema definition.) At
+ minimum, a record definition must include its type (<code>"type":
+ "record"</code>), a name (<code>"name": "User"</code>), and fields, in
+ this case <code>name</code>, <code>favorite_number</code>, and
+ <code>favorite_color</code>. We also define a namespace
+ (<code>"namespace": "example.avro"</code>), which together with the name
+ attribute defines the "full name" of the schema
+ (<code>example.avro.User</code> in this case).
+
+ </p>
+ <p>
+ Fields are defined via an array of objects, each of which defines a name
+ and type (other attributes are optional, see the <a
+ href="spec.html#schema_record">record specification</a> for more
+ details). The type attribute of a field is another schema object, which
+ can be either a primitive or complex type. For example, the
+ <code>name</code> field of our User schema is the primitive type
+ <code>string</code>, whereas the <code>favorite_number</code> and
+ <code>favorite_color</code> fields are both <code>union</code>s,
+ represented by JSON arrays. <code>union</code>s are a complex type that
+ can be any of the types listed in the array; e.g.,
+ <code>favorite_number</code> can either be an <code>int</code> or
+ <code>null</code>, essentially making it an optional field.
+ </p>
+ </section>
+
+ <section>
+ <title>Serializing and deserializing without code generation</title>
+ <p>
+ Data in Avro is always stored with its corresponding schema, meaning we
+ can always read a serialized item, regardless of whether we know the
+ schema ahead of time. This allows us to perform serialization and
+ deserialization without code generation. Note that the Avro Python
+ library does not support code generation.
+ </p>
+ <p>
+ Try running the following code snippet, which serializes two users to a
+ data file on disk, and then reads back and deserializes the data file:
+ </p>
+ <source>
+import avro.schema
+from avro.datafile import DataFileReader, DataFileWriter
+from avro.io import DatumReader, DatumWriter
+
+schema = avro.schema.parse(open("user.avsc").read())
+
+writer = DataFileWriter(open("users.avro", "w"), DatumWriter(), schema)
+writer.append({"name": "Alyssa", "favorite_number": 256})
+writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
+writer.close()
+
+reader = DataFileReader(open("users.avro", "r"), DatumReader())
+for user in reader:
+ print user
+reader.close()
+ </source>
+ <p>This outputs:</p>
+ <source>
+{u'favorite_color': None, u'favorite_number': 256, u'name': u'Alyssa'}
+{u'favorite_color': u'red', u'favorite_number': 7, u'name': u'Ben'}
+ </source>
+ <p>
+ Let's take a closer look at what's going on here.
+ </p>
+ <source>
+schema = avro.schema.parse(open("user.avsc").read())
+ </source>
+ <p>
+ <code>avro.schema.parse</code> takes a string containing a JSON schema
+ definition as input and outputs a <code>avro.schema.Schema</code> object
+ (specifically a subclass of <code>Schema</code>, in this case
+ <code>RecordSchema</code>). We're passing in the contents of our
+ user.avsc schema file here.
+ </p>
+ <source>
+writer = DataFileWriter(open("users.avro", "w"), DatumWriter(), schema)
+ </source>
+ <p>
+ We create a <code>DataFileWriter</code>, which we'll use to write
+ serialized items to a data file on disk. The
+ <code>DataFileWriter</code> constructor takes three arguments:
+ </p>
+ <ul>
+ <li>The file we'll serialize to</li>
+ <li>A <code>DatumWriter</code>, which is responsible for actually
+ serializing the items to Avro's binary format
+ (<code>DatumWriter</code>s can be used separately from
+ <code>DataFileWriter</code>s, e.g., to perform IPC with Avro
+ <strong>TODO: is this true??</strong>).</li>
+ <li>The schema we're using. The <code>DataFileWriter</code> needs the
+ schema both to write the schema to the data file, and to verify that
+ the items we write are valid items and write the appropriate
+ fields.</li>
+ </ul>
+ <source>
+writer.append({"name": "Alyssa", "favorite_number": 256})
+writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
+ </source>
+ <p>
+ We use <code>DataFileWriter.append</code> to add items to our data
+ file. Avro records are represented as Python <code>dict</code>s.
+ Since the field <code>favorite_color</code> has type <code>["int",
+ "null"]</code>, we are not required to specify this field, as shown in
+ the first append. Were we to omit the required <code>name</code>
+ field, an exception would be raised. Any extra entries not
+ corresponding to a field are present in the <code>dict</code> are
+ ignored.
+ </p>
+ <source>
+reader = DataFileReader(open("users.avro", "r"), DatumReader())
+ </source>
+ <p>
+ We open the file again, this time for reading back from disk. We use
+ a <code>DataFileReader</code> and <code>DatumReader</code> analagous
+ to the <code>DataFileWriter</code> and <code>DatumWriter</code> above.
+ </p>
+ <source>
+for user in reader:
+ print user
+ </source>
+ <p>
+ The <code>DataFileReader</code> is an iterator that returns
+ <code>dict</code>s corresponding to the serialized items.
+ </p>
+ </section>
+ </body>
+</document>
2  doc/src/content/xdocs/site.xml
View
@@ -41,6 +41,8 @@ See http://forrest.apache.org/docs/linking.html for more info
<docs label="Documentation">
<overview label="Overview" href="index.html" />
+ <gettingstartedjava label="Getting started (Java)" href="gettingstartedjava.html" />
+ <gettingstartedpython label="Getting started (Python)" href="gettingstartedpython.html" />
<spec label="Specification" href="spec.html" />
<trevni label="Trevni" href="ext:trevni/spec" />
<java-api label="Java API" href="ext:api/java/index" />
Please sign in to comment.
Something went wrong with that request. Please try again.