From 75ad55211397345e5192a19513d478b913028506 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sat, 2 Aug 2014 19:13:21 -0400 Subject: [PATCH 01/23] make check output style consistent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RAT and PEP8 checks don’t print a blank line after successful runs. The scalastyle check shouldn’t either. --- dev/scalastyle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/scalastyle b/dev/scalastyle index d9f2b91a3a091..b53053a04ff42 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -30,5 +30,5 @@ if test ! -z "$ERRORS"; then echo -e "Scalastyle checks failed at following occurrences:\n$ERRORS" exit 1 else - echo -e "Scalastyle checks passed.\n" + echo -e "Scalastyle checks passed." fi From 61c07b911b033b0356326397a6b7168dc94e6632 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sat, 2 Aug 2014 19:15:49 -0400 Subject: [PATCH 02/23] [SPARK-2627] add Python linter This guy just runs the pep8 utility on all code in the python directory, minus cloudpickle, which is a 3rd-party library. --- dev/lint-python | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100755 dev/lint-python diff --git a/dev/lint-python b/dev/lint-python new file mode 100755 index 0000000000000..5f58015e1ecde --- /dev/null +++ b/dev/lint-python @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" +PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" + +cd $SPARK_ROOT_DIR +# There is no need to write this output to a file +#+ first, but we do so so that the check status can +#+ be output before the report, like with the +#+ scalastyle and RAT checks. +pep8 ./python --exclude="cloudpickle.py" \ + > "$PEP8_REPORT_PATH" +pep8_status=${PIPESTATUS[0]} #$? + +if [ $pep8_status ] + then + echo "PEP8 checks failed." + cat "$PEP8_REPORT_PATH" + exit 1 + else + echo "PEP8 checks passed." +fi + +rm -f "$PEP8_REPORT_PATH" From 12440faa1a0a8b7eb41c5778ea49c360cb0de532 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sat, 2 Aug 2014 19:23:06 -0400 Subject: [PATCH 03/23] [SPARK-2627] add Scala linter This guy just calls scalastyle. --- dev/lint-scala | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100755 dev/lint-scala diff --git a/dev/lint-scala b/dev/lint-scala new file mode 100755 index 0000000000000..c676dfdf4f44e --- /dev/null +++ b/dev/lint-scala @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" + +"$SCRIPT_DIR/scalastyle" From 0541ebb5f30973aa54157256b4e43decfa9de8ba Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sat, 2 Aug 2014 19:23:30 -0400 Subject: [PATCH 04/23] [SPARK-2627] call Python linter from run-tests --- dev/run-tests | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dev/run-tests b/dev/run-tests index daa85bc750c07..59904ba6b6313 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -71,7 +71,12 @@ dev/check-license echo "=========================================================================" echo "Running Scala style checks" echo "=========================================================================" -dev/scalastyle +dev/lint-scala + +echo "=========================================================================" +echo "Running Python style checks" +echo "=========================================================================" +dev/lint-python echo "=========================================================================" echo "Running Spark unit tests" From 723ed39fb5067e3827a3e1f5434fb111b8e498cc Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sat, 2 Aug 2014 19:24:00 -0400 Subject: [PATCH 05/23] always delete the report file --- dev/lint-python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/lint-python b/dev/lint-python index 5f58015e1ecde..7a79644038d7d 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -34,9 +34,9 @@ if [ $pep8_status ] then echo "PEP8 checks failed." cat "$PEP8_REPORT_PATH" - exit 1 else echo "PEP8 checks passed." fi rm -f "$PEP8_REPORT_PATH" +exit $pep8_status From beaa9ac5154ec13c0067ada286327a615022bd59 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 00:05:33 -0400 Subject: [PATCH 06/23] [SPARK-2627] fail check on non-zero status --- dev/lint-python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/lint-python b/dev/lint-python index 7a79644038d7d..3105e64d33d4b 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -30,7 +30,7 @@ pep8 ./python --exclude="cloudpickle.py" \ > "$PEP8_REPORT_PATH" pep8_status=${PIPESTATUS[0]} #$? -if [ $pep8_status ] +if [ $pep8_status -ne 0 ] then echo "PEP8 checks failed." cat "$PEP8_REPORT_PATH" From a31ccc442fe0fa220eeac5c118d3045bb3d83142 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 00:06:18 -0400 Subject: [PATCH 07/23] [SPARK-2627] miscellaneous PEP 8 fixes Mostly done using autopep8, plus some hand fixes. 
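For reference, the automated portion of this pass can likely be
reproduced with an invocation along the following lines. The exact
flags are an assumption on my part (only the use of autopep8 itself is
recorded above); cloudpickle.py is excluded to match dev/lint-python:

    # sketch only -- assumes autopep8 is installed (pip install autopep8)
    # and that this is run from the Spark root directory
    autopep8 --in-place --recursive \
        --max-line-length=79 \
        --exclude="cloudpickle.py" \
        ./python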
--- python/pyspark/__init__.py | 3 +- python/pyspark/accumulators.py | 19 ++- python/pyspark/broadcast.py | 1 + python/pyspark/conf.py | 4 +- python/pyspark/context.py | 97 +++++++---- python/pyspark/daemon.py | 12 +- python/pyspark/files.py | 1 + python/pyspark/java_gateway.py | 7 +- python/pyspark/mllib/_common.py | 38 +++-- python/pyspark/mllib/classification.py | 14 +- python/pyspark/mllib/clustering.py | 6 +- python/pyspark/mllib/linalg.py | 17 +- python/pyspark/mllib/random.py | 26 +-- python/pyspark/mllib/recommendation.py | 5 +- python/pyspark/mllib/regression.py | 18 ++- python/pyspark/mllib/stat.py | 10 +- python/pyspark/mllib/tests.py | 36 +++-- python/pyspark/mllib/util.py | 10 +- python/pyspark/rdd.py | 18 ++- python/pyspark/rddsampler.py | 10 +- python/pyspark/resultiterable.py | 2 + python/pyspark/serializers.py | 9 ++ python/pyspark/shell.py | 3 +- python/pyspark/shuffle.py | 20 +-- python/pyspark/sql.py | 64 +++++--- python/pyspark/statcounter.py | 12 +- python/pyspark/storagelevel.py | 1 + python/pyspark/tests.py | 212 +++++++++++++++---------- python/pyspark/worker.py | 6 +- python/test_support/userlibrary.py | 2 + 30 files changed, 452 insertions(+), 231 deletions(-) diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index c58555fc9d2c5..ac55fcdc2057b 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -53,7 +53,8 @@ # mllib that depend on top level pyspark packages, which transitively depend on python's random. # Since Python's import logic looks for modules in the current package first, we eliminate # mllib.random as a candidate for C{import random} by removing the first search path, the script's -# location, in order to force the loader to look in Python's top-level modules for C{random}. +# location, in order to force the loader to look in Python's top-level +# modules for C{random}. import sys s = sys.path.pop(0) import random diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index 45d36e5d0e764..f85e2501f9b03 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -97,7 +97,8 @@ pickleSer = PickleSerializer() # Holds accumulators registered on the current machine, keyed by ID. This is then used to send -# the local accumulator updates back to the driver program at the end of a task. +# the local accumulator updates back to the driver program at the end of a +# task. _accumulatorRegistry = {} @@ -110,6 +111,7 @@ def _deserialize_accumulator(aid, zero_value, accum_param): class Accumulator(object): + """ A shared variable that can be accumulated, i.e., has a commutative and associative "add" operation. Worker tasks on a Spark cluster can add values to an Accumulator with the C{+=} @@ -139,14 +141,16 @@ def __reduce__(self): def value(self): """Get the accumulator's value; only usable in driver program""" if self._deserialized: - raise Exception("Accumulator.value cannot be accessed inside tasks") + raise Exception( + "Accumulator.value cannot be accessed inside tasks") return self._value @value.setter def value(self, value): """Sets the accumulator's value; only usable in driver program""" if self._deserialized: - raise Exception("Accumulator.value cannot be accessed inside tasks") + raise Exception( + "Accumulator.value cannot be accessed inside tasks") self._value = value def add(self, term): @@ -166,6 +170,7 @@ def __repr__(self): class AccumulatorParam(object): + """ Helper object that defines how to accumulate values of a given type. 
""" @@ -186,6 +191,7 @@ def addInPlace(self, value1, value2): class AddingAccumulatorParam(AccumulatorParam): + """ An AccumulatorParam that uses the + operators to add values. Designed for simple types such as integers, floats, and lists. Requires the zero value for the underlying type @@ -210,6 +216,7 @@ def addInPlace(self, value1, value2): class _UpdateRequestHandler(SocketServer.StreamRequestHandler): + """ This handler will keep polling updates from the same socket until the server is shutdown. @@ -218,7 +225,8 @@ class _UpdateRequestHandler(SocketServer.StreamRequestHandler): def handle(self): from pyspark.accumulators import _accumulatorRegistry while not self.server.server_shutdown: - # Poll every 1 second for new data -- don't block in case of shutdown. + # Poll every 1 second for new data -- don't block in case of + # shutdown. r, _, _ = select.select([self.rfile], [], [], 1) if self.rfile in r: num_updates = read_int(self.rfile) @@ -228,7 +236,9 @@ def handle(self): # Write a byte in acknowledgement self.wfile.write(struct.pack("!b", 1)) + class AccumulatorServer(SocketServer.TCPServer): + """ A simple TCP server that intercepts shutdown() in order to interrupt our continuous polling on the handler. @@ -239,6 +249,7 @@ def shutdown(self): self.server_shutdown = True SocketServer.TCPServer.shutdown(self) + def _start_update_server(): """Start a TCP server to receive accumulator updates in a daemon thread, and returns it""" server = AccumulatorServer(("localhost", 0), _UpdateRequestHandler) diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py index 43f40f8783bfd..f3e64989ed564 100644 --- a/python/pyspark/broadcast.py +++ b/python/pyspark/broadcast.py @@ -45,6 +45,7 @@ def _from_id(bid): class Broadcast(object): + """ A broadcast variable created with L{SparkContext.broadcast()}. diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index b4c82f519bd53..ebfc0ad003b3b 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -56,6 +56,7 @@ class SparkConf(object): + """ Configuration for a Spark application. Used to set various Spark parameters as key-value pairs. @@ -124,7 +125,8 @@ def setSparkHome(self, value): def setExecutorEnv(self, key=None, value=None, pairs=None): """Set an environment variable to be passed to executors.""" if (key is not None and pairs is not None) or (key is None and pairs is None): - raise Exception("Either pass one key-value pair or a list of pairs") + raise Exception( + "Either pass one key-value pair or a list of pairs") elif key is not None: self._jconf.setExecutorEnv(key, value) elif pairs is not None: diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 2e80eb50f2207..3ecd9a58f5b74 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -47,6 +47,7 @@ class SparkContext(object): + """ Main entry point for Spark functionality. 
A SparkContext represents the connection to a Spark cluster, and can be used to create L{RDD}s and @@ -59,7 +60,8 @@ class SparkContext(object): _next_accum_id = 0 _active_spark_context = None _lock = Lock() - _python_includes = None # zip and egg files that need to be added to PYTHONPATH + # zip and egg files that need to be added to PYTHONPATH + _python_includes = None _default_batch_size_for_serialized_input = 10 def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, @@ -99,13 +101,15 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, self._callsite = rdd._extract_concise_traceback() else: tempNamedTuple = namedtuple("Callsite", "function file linenum") - self._callsite = tempNamedTuple(function=None, file=None, linenum=None) + self._callsite = tempNamedTuple( + function=None, file=None, linenum=None) SparkContext._ensure_initialized(self, gateway=gateway) try: self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf) except: - # If an error occurs, clean up in order to allow future SparkContext creation: + # If an error occurs, clean up in order to allow future + # SparkContext creation: self.stop() raise @@ -138,7 +142,8 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, if not self._conf.contains("spark.master"): raise Exception("A master URL must be set in your configuration") if not self._conf.contains("spark.app.name"): - raise Exception("An application name must be set in your configuration") + raise Exception( + "An application name must be set in your configuration") # Read back our properties from the conf in case we loaded some of them from # the classpath or an external config file @@ -179,7 +184,8 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, self.addPyFile(path) # Deploy code dependencies set by spark-submit; these will already have been added - # with SparkContext.addFile, so we just need to add them to the PYTHONPATH + # with SparkContext.addFile, so we just need to add them to the + # PYTHONPATH for path in self._conf.get("spark.submit.pyFiles", "").split(","): if path != "": (dirname, filename) = os.path.split(path) @@ -189,9 +195,11 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, sys.path.append(dirname) # Create a temporary directory inside spark.local.dir: - local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf()) + local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir( + self._jsc.sc().conf()) self._temp_dir = \ - self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath() + self._jvm.org.apache.spark.util.Utils.createTempDir( + local_dir).getAbsolutePath() def _initialize_context(self, jconf): """ @@ -213,7 +221,7 @@ def _ensure_initialized(cls, instance=None, gateway=None): if instance: if (SparkContext._active_spark_context and - SparkContext._active_spark_context != instance): + SparkContext._active_spark_context != instance): currentMaster = SparkContext._active_spark_context.master currentAppName = SparkContext._active_spark_context.appName callsite = SparkContext._active_spark_context._callsite @@ -284,7 +292,8 @@ def parallelize(self, c, numSlices=None): # because it sends O(n) Py4J commands. As an alternative, serialized # objects are written to a file and loaded through textFile(). 
tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) - # Make sure we distribute data evenly if it's smaller than self.batchSize + # Make sure we distribute data evenly if it's smaller than + # self.batchSize if "__len__" not in dir(c): c = list(c) # Make it a list so we can compute its length batchSize = min(len(c) // numSlices, self._batchSize) @@ -403,10 +412,12 @@ def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter=None, Java object. (default sc._default_batch_size_for_serialized_input) """ minSplits = minSplits or min(self.defaultParallelism, 2) - batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() + batchSize = max( + 1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if ( + batchSize > 1) else PickleSerializer() jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, path, keyClass, valueClass, - keyConverter, valueConverter, minSplits, batchSize) + keyConverter, valueConverter, minSplits, batchSize) return RDD(jrdd, self, ser) def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None, @@ -434,10 +445,13 @@ def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConv Java object. (default sc._default_batch_size_for_serialized_input) """ jconf = self._dictToJavaMap(conf) - batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() - jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, jconf, batchSize) + batchSize = max( + 1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if ( + batchSize > 1) else PickleSerializer() + jrdd = self._jvm.PythonRDD.newAPIHadoopFile( + self._jsc, path, inputFormatClass, keyClass, + valueClass, keyConverter, valueConverter, jconf, batchSize) return RDD(jrdd, self, ser) def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None, @@ -462,10 +476,13 @@ def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=N Java object. (default sc._default_batch_size_for_serialized_input) """ jconf = self._dictToJavaMap(conf) - batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() - jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, jconf, batchSize) + batchSize = max( + 1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if ( + batchSize > 1) else PickleSerializer() + jrdd = self._jvm.PythonRDD.newAPIHadoopRDD( + self._jsc, inputFormatClass, keyClass, + valueClass, keyConverter, valueConverter, jconf, batchSize) return RDD(jrdd, self, ser) def hadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None, @@ -493,10 +510,13 @@ def hadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter= Java object. 
(default sc._default_batch_size_for_serialized_input) """ jconf = self._dictToJavaMap(conf) - batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() - jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, path, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, jconf, batchSize) + batchSize = max( + 1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if ( + batchSize > 1) else PickleSerializer() + jrdd = self._jvm.PythonRDD.hadoopFile( + self._jsc, path, inputFormatClass, keyClass, + valueClass, keyConverter, valueConverter, jconf, batchSize) return RDD(jrdd, self, ser) def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None, @@ -521,10 +541,12 @@ def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None, Java object. (default sc._default_batch_size_for_serialized_input) """ jconf = self._dictToJavaMap(conf) - batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() + batchSize = max( + 1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if ( + batchSize > 1) else PickleSerializer() jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass, valueClass, - keyConverter, valueConverter, jconf, batchSize) + keyConverter, valueConverter, jconf, batchSize) return RDD(jrdd, self, ser) def _checkpointFile(self, name, input_deserializer): @@ -587,7 +609,8 @@ def accumulator(self, value, accum_param=None): elif isinstance(value, complex): accum_param = accumulators.COMPLEX_ACCUMULATOR_PARAM else: - raise Exception("No default accumulator param for type %s" % type(value)) + raise Exception( + "No default accumulator param for type %s" % type(value)) SparkContext._next_accum_id += 1 return Accumulator(SparkContext._next_accum_id - 1, value, accum_param) @@ -632,12 +655,14 @@ def addPyFile(self, path): HTTP, HTTPS or FTP URI. """ self.addFile(path) - (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix + # dirname may be directory or HDFS/S3 prefix + (dirname, filename) = os.path.split(path) if filename.endswith('.zip') or filename.endswith('.ZIP') or filename.endswith('.egg'): self._python_includes.append(filename) # for tests in local mode - sys.path.append(os.path.join(SparkFiles.getRootDirectory(), filename)) + sys.path.append( + os.path.join(SparkFiles.getRootDirectory(), filename)) def setCheckpointDir(self, dirName): """ @@ -651,7 +676,8 @@ def _getJavaStorageLevel(self, storageLevel): Returns a Java StorageLevel based on a pyspark.StorageLevel. 
""" if not isinstance(storageLevel, StorageLevel): - raise Exception("storageLevel must be of type pyspark.StorageLevel") + raise Exception( + "storageLevel must be of type pyspark.StorageLevel") newStorageLevel = self._jvm.org.apache.spark.storage.StorageLevel return newStorageLevel(storageLevel.useDisk, @@ -754,13 +780,15 @@ def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False): """ if partitions is None: partitions = range(rdd._jrdd.partitions().size()) - javaPartitions = ListConverter().convert(partitions, self._gateway._gateway_client) + javaPartitions = ListConverter().convert( + partitions, self._gateway._gateway_client) # Implementation note: This is implemented as a mapPartitions followed # by runJob() in order to avoid having to pass a Python lambda into # SparkContext#runJob. mappedRDD = rdd.mapPartitions(partitionFunc) - it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal) + it = self._jvm.PythonRDD.runJob( + self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal) return list(mappedRDD._collect_iterator_through_file(it)) @@ -772,7 +800,8 @@ def _test(): globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) globs['tempdir'] = tempfile.mkdtemp() atexit.register(lambda: shutil.rmtree(globs['tempdir'])) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/daemon.py b/python/pyspark/daemon.py index 9fde0dde0f4b4..73fb691ea3bf3 100644 --- a/python/pyspark/daemon.py +++ b/python/pyspark/daemon.py @@ -30,7 +30,8 @@ def compute_real_exit_code(exit_code): - # SystemExit's code can be integer or string, but os._exit only accepts integers + # SystemExit's code can be integer or string, but os._exit only accepts + # integers if isinstance(exit_code, numbers.Integral): return exit_code else: @@ -43,7 +44,8 @@ def worker(sock): """ # Redirect stdout to stderr os.dup2(2, 1) - sys.stdout = sys.stderr # The sys.stdout object is different from file descriptor 1 + # The sys.stdout object is different from file descriptor 1 + sys.stdout = sys.stderr signal.signal(SIGHUP, SIG_DFL) signal.signal(SIGCHLD, SIG_DFL) @@ -62,7 +64,8 @@ def waitSocketClose(sock): # Read the socket using fdopen instead of socket.makefile() because the latter # seems to be very slow; note that we need to dup() the file descriptor because - # otherwise writes also cause a seek that makes us miss data on the read side. + # otherwise writes also cause a seek that makes us miss data on the read + # side. infile = os.fdopen(os.dup(sock.fileno()), "a+", 65536) outfile = os.fdopen(os.dup(sock.fileno()), "a+", 65536) exit_code = 0 @@ -106,7 +109,8 @@ def handle_sigchld(*args): try: pid, status = os.waitpid(0, os.WNOHANG) if status != 0: - msg = "worker %s crashed abruptly with exit status %s" % (pid, status) + msg = "worker %s crashed abruptly with exit status %s" % ( + pid, status) print >> sys.stderr, msg except EnvironmentError as err: if err.errno not in (ECHILD, EINTR): diff --git a/python/pyspark/files.py b/python/pyspark/files.py index 57ee14eeb7776..331de9a9b2212 100644 --- a/python/pyspark/files.py +++ b/python/pyspark/files.py @@ -19,6 +19,7 @@ class SparkFiles(object): + """ Resolves paths to files added through L{SparkContext.addFile()}. 
diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 2c129679f47f3..2be2282f0c80b 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -39,12 +39,14 @@ def launch_gateway(): submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS") submit_args = submit_args if submit_args is not None else "" submit_args = shlex.split(submit_args) - command = [os.path.join(SPARK_HOME, script), "pyspark-shell"] + submit_args + command = [ + os.path.join(SPARK_HOME, script), "pyspark-shell"] + submit_args if not on_windows: # Don't send ctrl-c / SIGINT to the Java gateway: def preexec_func(): signal.signal(signal.SIGINT, signal.SIG_IGN) - proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func) + proc = Popen( + command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func) else: # preexec_fn not supported on Windows proc = Popen(command, stdout=PIPE, stdin=PIPE) @@ -65,6 +67,7 @@ def preexec_func(): # Create a thread to echo output from the GatewayServer, which is required # for Java log output to show up: class EchoOutputThread(Thread): + def __init__(self, stream): Thread.__init__(self) self.daemon = True diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py index c6ca6a75df746..fca23111d3c2b 100644 --- a/python/pyspark/mllib/_common.py +++ b/python/pyspark/mllib/_common.py @@ -35,7 +35,8 @@ # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, -# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. +# such as _dot and _serialize_double_vector, start to support scipy.sparse +# matrices. _have_scipy = False _scipy_issparse = None @@ -72,9 +73,9 @@ # Python interpreter must agree on what endian the machine is. -DENSE_VECTOR_MAGIC = 1 +DENSE_VECTOR_MAGIC = 1 SPARSE_VECTOR_MAGIC = 2 -DENSE_MATRIX_MAGIC = 3 +DENSE_MATRIX_MAGIC = 3 LABELED_POINT_MAGIC = 4 @@ -164,7 +165,8 @@ def _serialize_sparse_vector(v): header[1] = nonzeros _copyto(v.indices, buffer=ba, offset=9, shape=[nonzeros], dtype=int32) values_offset = 9 + 4 * nonzeros - _copyto(v.values, buffer=ba, offset=values_offset, shape=[nonzeros], dtype=float64) + _copyto(v.values, buffer=ba, offset=values_offset, + shape=[nonzeros], dtype=float64) return ba @@ -188,9 +190,11 @@ def _deserialize_double(ba, offset=0): True """ if type(ba) != bytearray: - raise TypeError("_deserialize_double called on a %s; wanted bytearray" % type(ba)) + raise TypeError( + "_deserialize_double called on a %s; wanted bytearray" % type(ba)) if len(ba) - offset != 8: - raise TypeError("_deserialize_double called on a %d-byte array; wanted 8 bytes." % nb) + raise TypeError( + "_deserialize_double called on a %d-byte array; wanted 8 bytes." 
% nb) return struct.unpack("d", ba[offset:])[0] @@ -246,7 +250,8 @@ def _deserialize_sparse_vector(ba, offset=0): raise TypeError("_deserialize_sparse_vector called on bytearray " "with wrong length") indices = _deserialize_numpy_array([nonzeros], ba, offset + 9, dtype=int32) - values = _deserialize_numpy_array([nonzeros], ba, offset + 9 + 4 * nonzeros, dtype=float64) + values = _deserialize_numpy_array( + [nonzeros], ba, offset + 9 + 4 * nonzeros, dtype=float64) return SparseVector(int(size), indices, values) @@ -325,7 +330,8 @@ def _deserialize_labeled_point(ba, offset=0): if type(ba) != bytearray: raise TypeError("Expecting a bytearray but got %s" % type(ba)) if ba[offset] != LABELED_POINT_MAGIC: - raise TypeError("Expecting magic number %d but got %d" % (LABELED_POINT_MAGIC, ba[0])) + raise TypeError("Expecting magic number %d but got %d" % + (LABELED_POINT_MAGIC, ba[0])) label = ndarray(shape=[1], buffer=ba, offset=offset + 1, dtype=float64)[0] features = _deserialize_double_vector(ba, offset + 9) return LabeledPoint(label, features) @@ -339,7 +345,8 @@ def _copyto(array, buffer, offset, shape, dtype): TODO: In the future this could use numpy.copyto on NumPy 1.7+, but we should benchmark that to see whether it provides a benefit. """ - temp_array = ndarray(shape=shape, buffer=buffer, offset=offset, dtype=dtype, order='C') + temp_array = ndarray( + shape=shape, buffer=buffer, offset=offset, dtype=dtype, order='C') temp_array[...] = array @@ -356,7 +363,8 @@ def _get_unmangled_double_vector_rdd(data): return _get_unmangled_rdd(data, _serialize_double_vector) -# Map a pickled Python RDD of LabeledPoint to a Java RDD of _serialized_labeled_points +# Map a pickled Python RDD of LabeledPoint to a Java RDD of +# _serialized_labeled_points def _get_unmangled_labeled_point_rdd(data): return _get_unmangled_rdd(data, _serialize_labeled_point) @@ -383,7 +391,8 @@ def _linear_predictor_typecheck(x, coeffs): elif (type(x) == RDD): raise RuntimeError("Bulk predict not yet supported.") else: - raise TypeError("Argument of type " + type(x).__name__ + " unsupported") + raise TypeError( + "Argument of type " + type(x).__name__ + " unsupported") # If we weren't given initial weights, take a zero vector of the appropriate @@ -430,6 +439,7 @@ def _serialize_rating(r): class RatingDeserializer(Serializer): + def loads(self, stream): length = struct.unpack("!i", stream.read(4))[0] ba = stream.read(length) @@ -478,7 +488,8 @@ def _convert_vector(vec): assert vec.shape[1] == 1, "Expected column vector" csc = vec.tocsc() return SparseVector(vec.shape[0], csc.indices, csc.data) - raise TypeError("Expected NumPy array, SparseVector, or scipy.sparse matrix") + raise TypeError( + "Expected NumPy array, SparseVector, or scipy.sparse matrix") def _squared_distance(v1, v2): @@ -531,7 +542,8 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 2bbb9c3fca315..2e31665105c62 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -31,6 +31,7 @@ class LogisticRegressionModel(LinearModel): + """A linear binary classification model derived from logistic regression. 
>>> data = [ @@ -60,6 +61,7 @@ class LogisticRegressionModel(LinearModel): >>> lrm.predict(SparseVector(2, {1: 0.0})) <= 0 True """ + def predict(self, x): _linear_predictor_typecheck(x, self._coeff) margin = _dot(x, self._coeff) + self._intercept @@ -72,6 +74,7 @@ def predict(self, x): class LogisticRegressionWithSGD(object): + @classmethod def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None): """Train a logistic regression model on the given data.""" @@ -83,6 +86,7 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWei class SVMModel(LinearModel): + """A support vector machine. >>> data = [ @@ -106,6 +110,7 @@ class SVMModel(LinearModel): >>> svm.predict(SparseVector(2, {0: -1.0})) <= 0 True """ + def predict(self, x): _linear_predictor_typecheck(x, self._coeff) margin = _dot(x, self._coeff) + self._intercept @@ -113,6 +118,7 @@ def predict(self, x): class SVMWithSGD(object): + @classmethod def train(cls, data, iterations=100, step=1.0, regParam=1.0, miniBatchFraction=1.0, initialWeights=None): @@ -124,6 +130,7 @@ def train(cls, data, iterations=100, step=1.0, regParam=1.0, class NaiveBayesModel(object): + """ Model for Naive Bayes classifiers. @@ -164,6 +171,7 @@ def predict(self, x): class NaiveBayes(object): + @classmethod def train(cls, data, lambda_=1.0): """ @@ -182,7 +190,8 @@ def train(cls, data, lambda_=1.0): """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) - ans = sc._jvm.PythonMLLibAPI().trainNaiveBayes(dataBytes._jrdd, lambda_) + ans = sc._jvm.PythonMLLibAPI().trainNaiveBayes( + dataBytes._jrdd, lambda_) return NaiveBayesModel( _deserialize_double_vector(ans[0]), _deserialize_double_vector(ans[1]), @@ -193,7 +202,8 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index b380e8f6c8725..7e974d7186bf7 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -27,6 +27,7 @@ class KMeansModel(object): + """A clustering model derived from the k-means method. >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4,2) @@ -55,6 +56,7 @@ class KMeansModel(object): >>> type(model.clusterCenters) """ + def __init__(self, centers): self.centers = centers @@ -76,6 +78,7 @@ def predict(self, x): class KMeans(object): + @classmethod def train(cls, data, k, maxIterations=100, runs=1, initializationMode="k-means||"): """Train a k-means clustering model.""" @@ -96,7 +99,8 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 54720c2324ca6..41b1f2976893b 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -27,6 +27,7 @@ class SparseVector(object): + """ A simple sparse vector class for passing data to MLlib. Users may alternatively pass SciPy's {scipy.sparse} data types. 
@@ -59,7 +60,8 @@ def __init__(self, size, *args): self.indices = array([p[0] for p in pairs], dtype=int32) self.values = array([p[1] for p in pairs], dtype=float64) else: - assert len(args[0]) == len(args[1]), "index and value arrays not same length" + assert len(args[0]) == len( + args[1]), "index and value arrays not same length" self.indices = array(args[0], dtype=int32) self.values = array(args[1], dtype=float64) for i in xrange(len(self.indices) - 1): @@ -88,10 +90,12 @@ def dot(self, other): result += self.values[i] * other[self.indices[i]] return result elif other.ndim == 2: - results = [self.dot(other[:, i]) for i in xrange(other.shape[1])] + results = [self.dot(other[:, i]) + for i in xrange(other.shape[1])] return array(results) else: - raise Exception("Cannot call dot with %d-dimensional array" % other.ndim) + raise Exception( + "Cannot call dot with %d-dimensional array" % other.ndim) else: result = 0.0 i, j = 0, 0 @@ -167,7 +171,8 @@ def __str__(self): def __repr__(self): inds = self.indices vals = self.values - entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))]) + entries = ", ".join( + ["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))]) return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): @@ -192,6 +197,7 @@ def __ne__(self, other): class Vectors(object): + """ Factory methods for working with vectors. Note that dense vectors are simply represented as NumPy array objects, so there is no need @@ -256,7 +262,8 @@ def _test(): if __name__ == "__main__": # remove current path from list of search paths to avoid importing mllib.random - # for C{import random}, which is done in an external dependency of pyspark during doctests. + # for C{import random}, which is done in an external dependency of pyspark + # during doctests. import sys sys.path.pop(0) _test() diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py index 36e710dbae7a8..1b51da913c4b3 100644 --- a/python/pyspark/mllib/random.py +++ b/python/pyspark/mllib/random.py @@ -24,7 +24,9 @@ from pyspark.mllib._common import _deserialize_double, _deserialize_double_vector from pyspark.serializers import NoOpSerializer + class RandomRDDGenerators: + """ Generator methods for creating RDDs comprised of i.i.d samples from some distribution. 
@@ -52,8 +54,9 @@ def uniformRDD(sc, size, numPartitions=None, seed=None): >>> parts == sc.defaultParallelism True """ - jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed) - uniform = RDD(jrdd, sc, NoOpSerializer()) + jrdd = sc._jvm.PythonMLLibAPI().uniformRDD( + sc._jsc, size, numPartitions, seed) + uniform = RDD(jrdd, sc, NoOpSerializer()) return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes))) @staticmethod @@ -76,8 +79,9 @@ def normalRDD(sc, size, numPartitions=None, seed=None): >>> abs(stats.stdev() - 1.0) < 0.1 True """ - jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed) - normal = RDD(jrdd, sc, NoOpSerializer()) + jrdd = sc._jvm.PythonMLLibAPI().normalRDD( + sc._jsc, size, numPartitions, seed) + normal = RDD(jrdd, sc, NoOpSerializer()) return normal.map(lambda bytes: _deserialize_double(bytearray(bytes))) @staticmethod @@ -97,8 +101,9 @@ def poissonRDD(sc, mean, size, numPartitions=None, seed=None): >>> abs(stats.stdev() - sqrt(mean)) < 0.5 True """ - jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed) - poisson = RDD(jrdd, sc, NoOpSerializer()) + jrdd = sc._jvm.PythonMLLibAPI().poissonRDD( + sc._jsc, mean, size, numPartitions, seed) + poisson = RDD(jrdd, sc, NoOpSerializer()) return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes))) @staticmethod @@ -118,7 +123,7 @@ def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ jrdd = sc._jvm.PythonMLLibAPI() \ .uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed) - uniform = RDD(jrdd, sc, NoOpSerializer()) + uniform = RDD(jrdd, sc, NoOpSerializer()) return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes))) @staticmethod @@ -138,7 +143,7 @@ def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ jrdd = sc._jvm.PythonMLLibAPI() \ .normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed) - normal = RDD(jrdd, sc, NoOpSerializer()) + normal = RDD(jrdd, sc, NoOpSerializer()) return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes))) @staticmethod @@ -161,7 +166,7 @@ def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ jrdd = sc._jvm.PythonMLLibAPI() \ .poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed) - poisson = RDD(jrdd, sc, NoOpSerializer()) + poisson = RDD(jrdd, sc, NoOpSerializer()) return poisson.map(lambda bytes: _deserialize_double_vector(bytearray(bytes))) @@ -172,7 +177,8 @@ def _test(): # The small batch size here ensures that we see multiple batches, # even in these small test examples: globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 6c385042ffa5f..c398b008fda54 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -26,6 +26,7 @@ class MatrixFactorizationModel(object): + """A matrix factorisation model trained by regularized alternating least-squares. 
@@ -58,6 +59,7 @@ def predictAll(self, usersProducts): class ALS(object): + @classmethod def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1): sc = ratings.context @@ -79,7 +81,8 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 041b119269427..c23ec7e44ca6d 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -27,6 +27,7 @@ class LabeledPoint(object): + """ The features and labels of a data point. @@ -34,6 +35,7 @@ class LabeledPoint(object): @param features: Vector of features for this point (NumPy array, list, pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix) """ + def __init__(self, label, features): self.label = label if (type(features) == ndarray or type(features) == SparseVector @@ -42,14 +44,17 @@ def __init__(self, label, features): elif type(features) == list: self.features = array(features) else: - raise TypeError("Expected NumPy array, list, SparseVector, or scipy.sparse matrix") + raise TypeError( + "Expected NumPy array, list, SparseVector, or scipy.sparse matrix") def __str__(self): return "(" + ",".join((str(self.label), Vectors.stringify(self.features))) + ")" class LinearModel(object): + """A linear model that has a vector of coefficients and an intercept.""" + def __init__(self, weights, intercept): self._coeff = weights self._intercept = intercept @@ -64,6 +69,7 @@ def intercept(self): class LinearRegressionModelBase(LinearModel): + """A linear regression model. >>> lrmb = LinearRegressionModelBase(array([1.0, 2.0]), 0.1) @@ -72,6 +78,7 @@ class LinearRegressionModelBase(LinearModel): >>> abs(lrmb.predict(SparseVector(2, {0: -1.03, 1: 7.777})) - 14.624) < 1e-6 True """ + def predict(self, x): """Predict the value of the dependent variable given a vector x""" """containing values for the independent variables.""" @@ -80,6 +87,7 @@ def predict(self, x): class LinearRegressionModel(LinearRegressionModelBase): + """A linear regression model derived from a least-squares fit. >>> from pyspark.mllib.regression import LabeledPoint @@ -111,6 +119,7 @@ class LinearRegressionModel(LinearRegressionModelBase): class LinearRegressionWithSGD(object): + @classmethod def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=1.0, regType=None, intercept=False): @@ -146,6 +155,7 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, class LassoModel(LinearRegressionModelBase): + """A linear regression model derived from a least-squares fit with an l_1 penalty term. @@ -178,6 +188,7 @@ class LassoModel(LinearRegressionModelBase): class LassoWithSGD(object): + @classmethod def train(cls, data, iterations=100, step=1.0, regParam=1.0, miniBatchFraction=1.0, initialWeights=None): @@ -189,6 +200,7 @@ def train(cls, data, iterations=100, step=1.0, regParam=1.0, class RidgeRegressionModel(LinearRegressionModelBase): + """A linear regression model derived from a least-squares fit with an l_2 penalty term. 
@@ -221,6 +233,7 @@ class RidgeRegressionModel(LinearRegressionModelBase): class RidgeRegressionWithSGD(object): + @classmethod def train(cls, data, iterations=100, step=1.0, regParam=1.0, miniBatchFraction=1.0, initialWeights=None): @@ -235,7 +248,8 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat.py index 0a08a562d1f1f..dbf2a03bb3a73 100644 --- a/python/pyspark/mllib/stat.py +++ b/python/pyspark/mllib/stat.py @@ -24,6 +24,7 @@ _serialize_double, _serialize_double_vector, \ _deserialize_double, _deserialize_double_matrix + class Statistics(object): @staticmethod @@ -79,13 +80,15 @@ def corr(x, y=None, method=None): try: Xser = _get_unmangled_double_vector_rdd(x) except TypeError: - raise TypeError("corr called on a single RDD not consisted of Vectors.") + raise TypeError( + "corr called on a single RDD not consisted of Vectors.") resultMat = sc._jvm.PythonMLLibAPI().corr(Xser._jrdd, method) return _deserialize_double_matrix(resultMat) else: xSer = _get_unmangled_rdd(x, _serialize_double) ySer = _get_unmangled_rdd(y, _serialize_double) - result = sc._jvm.PythonMLLibAPI().corr(xSer._jrdd, ySer._jrdd, method) + result = sc._jvm.PythonMLLibAPI().corr( + xSer._jrdd, ySer._jrdd, method) return result @@ -94,7 +97,8 @@ def _test(): from pyspark import SparkContext globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 37ccf1d590743..d5dd07299269b 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -39,6 +39,7 @@ class VectorTests(unittest.TestCase): + def test_serialize(self): sv = SparseVector(4, {1: 1, 3: 2}) dv = array([1., 2., 3., 4.]) @@ -46,9 +47,12 @@ def test_serialize(self): self.assertTrue(sv is _convert_vector(sv)) self.assertTrue(dv is _convert_vector(dv)) self.assertTrue(array_equal(dv, _convert_vector(lst))) - self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(sv))) - self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(dv)))) - self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(lst)))) + self.assertEquals( + sv, _deserialize_double_vector(_serialize_double_vector(sv))) + self.assertTrue( + array_equal(dv, _deserialize_double_vector(_serialize_double_vector(dv)))) + self.assertTrue( + array_equal(dv, _deserialize_double_vector(_serialize_double_vector(lst)))) def test_dot(self): sv = SparseVector(4, {1: 1, 3: 2}) @@ -61,9 +65,11 @@ def test_dot(self): self.assertEquals(10.0, _dot(sv, dv)) self.assertTrue(array_equal(array([3., 6., 9., 12.]), _dot(sv, mat))) self.assertEquals(30.0, _dot(dv, dv)) - self.assertTrue(array_equal(array([10., 20., 30., 40.]), _dot(dv, mat))) + self.assertTrue( + array_equal(array([10., 20., 30., 40.]), _dot(dv, mat))) self.assertEquals(30.0, _dot(lst, dv)) - self.assertTrue(array_equal(array([10., 20., 30., 40.]), _dot(lst, mat))) + 
self.assertTrue( + array_equal(array([10., 20., 30., 40.]), _dot(lst, mat))) def test_squared_distance(self): sv = SparseVector(4, {1: 1, 3: 2}) @@ -81,6 +87,7 @@ def test_squared_distance(self): class ListTests(PySparkTestCase): + """ Test MLlib algorithms on plain lists, to make sure they're passed through as NumPy arrays. @@ -94,7 +101,8 @@ def test_clustering(self): [1.1, 0], [1.2, 0], ] - clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||") + clusters = KMeans.train( + self.sc.parallelize(data), 2, initializationMode="k-means||") self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1])) self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3])) @@ -160,6 +168,7 @@ def test_regression(self): @unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): + """ Test both vector operations and MLlib algorithms with SciPy sparse matrices, if SciPy is available. @@ -176,10 +185,14 @@ def test_serialize(self): self.assertEquals(sv, _convert_vector(lil.tocoo())) self.assertEquals(sv, _convert_vector(lil.tocsr())) self.assertEquals(sv, _convert_vector(lil.todok())) - self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil))) - self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsc()))) - self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsr()))) - self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.todok()))) + self.assertEquals( + sv, _deserialize_double_vector(_serialize_double_vector(lil))) + self.assertEquals( + sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsc()))) + self.assertEquals( + sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsr()))) + self.assertEquals( + sv, _deserialize_double_vector(_serialize_double_vector(lil.todok()))) def test_dot(self): from scipy.sparse import lil_matrix @@ -223,7 +236,8 @@ def test_clustering(self): self.scipy_matrix(3, {2: 1.0}), self.scipy_matrix(3, {2: 1.1}) ] - clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||") + clusters = KMeans.train( + self.sc.parallelize(data), 2, initializationMode="k-means||") self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1])) self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3])) diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index d94900cefdb77..5dbab5102e5f8 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -25,6 +25,7 @@ class MLUtils: + """ Helper methods to load, save and pre-process data used in MLlib. 
""" @@ -128,7 +129,8 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) if numFeatures <= 0: parsed.cache() - numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 + numFeatures = parsed.map( + lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) @staticmethod @@ -183,7 +185,8 @@ def loadLabeledPoints(sc, path, minPartitions=None): (0.0,[1.01,2.02,3.03]) """ minPartitions = minPartitions or min(sc.defaultParallelism, 2) - jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints(sc._jsc, path, minPartitions) + jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints( + sc._jsc, path, minPartitions) serialized = RDD(jSerialized, sc, NoOpSerializer()) return serialized.map(lambda bytes: _deserialize_labeled_point(bytearray(bytes))) @@ -195,7 +198,8 @@ def _test(): # The small batch size here ensures that we see multiple batches, # even in these small test examples: globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 309f5a9b6038d..374535587cfde 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -233,7 +233,7 @@ def __init__(self, jrdd, ctx, jrdd_deserializer): def _toPickleSerialization(self): if (self._jrdd_deserializer == PickleSerializer() or - self._jrdd_deserializer == BatchedSerializer(PickleSerializer())): + self._jrdd_deserializer == BatchedSerializer(PickleSerializer())): return self else: return self._reserialize(BatchedSerializer(PickleSerializer(), 10)) @@ -1078,7 +1078,8 @@ def saveAsNewAPIHadoopFile(self, path, outputFormatClass, keyClass=None, valueCl jconf = self.ctx._dictToJavaMap(conf) pickledRDD = self._toPickleSerialization() batched = isinstance(pickledRDD._jrdd_deserializer, BatchedSerializer) - self.ctx._jvm.PythonRDD.saveAsNewAPIHadoopFile(pickledRDD._jrdd, batched, path, + self.ctx._jvm.PythonRDD.saveAsNewAPIHadoopFile( + pickledRDD._jrdd, batched, path, outputFormatClass, keyClass, valueClass, keyConverter, valueConverter, jconf) def saveAsHadoopDataset(self, conf, keyConverter=None, valueConverter=None): @@ -1124,7 +1125,8 @@ def saveAsHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=No jconf = self.ctx._dictToJavaMap(conf) pickledRDD = self._toPickleSerialization() batched = isinstance(pickledRDD._jrdd_deserializer, BatchedSerializer) - self.ctx._jvm.PythonRDD.saveAsHadoopFile(pickledRDD._jrdd, batched, path, + self.ctx._jvm.PythonRDD.saveAsHadoopFile( + pickledRDD._jrdd, batched, path, outputFormatClass, keyClass, valueClass, keyConverter, valueConverter, jconf, compressionCodecClass) @@ -1348,7 +1350,7 @@ def partitionBy(self, numPartitions, partitionFunc=portable_hash): outputSerializer = self.ctx._unbatched_serializer limit = (_parse_memory(self.ctx._conf.get( - "spark.python.worker.memory", "512m")) / 2) + "spark.python.worker.memory", "512m")) / 2) def add_shuffle_key(split, iterator): @@ -1430,12 +1432,12 @@ def combineByKey(self, createCombiner, mergeValue, mergeCombiners, spill = (self.ctx._conf.get("spark.shuffle.spill", 'True').lower() == 'true') memory = _parse_memory(self.ctx._conf.get( - 
"spark.python.worker.memory", "512m")) + "spark.python.worker.memory", "512m")) agg = Aggregator(createCombiner, mergeValue, mergeCombiners) def combineLocally(iterator): merger = ExternalMerger(agg, memory * 0.9, serializer) \ - if spill else InMemoryMerger(agg) + if spill else InMemoryMerger(agg) merger.mergeValues(iterator) return merger.iteritems() @@ -1444,7 +1446,7 @@ def combineLocally(iterator): def _mergeCombiners(iterator): merger = ExternalMerger(agg, memory, serializer) \ - if spill else InMemoryMerger(agg) + if spill else InMemoryMerger(agg) merger.mergeCombiners(iterator) return merger.iteritems() @@ -1588,7 +1590,7 @@ def sampleByKey(self, withReplacement, fractions, seed=None): """ for fraction in fractions.values(): assert fraction >= 0.0, "Negative fraction value: %s" % fraction - return self.mapPartitionsWithIndex( \ + return self.mapPartitionsWithIndex( RDDStratifiedSampler(withReplacement, fractions, seed).func, True) def subtractByKey(self, other, numPartitions=None): diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py index 2df000fdb08ca..35940191c70a3 100644 --- a/python/pyspark/rddsampler.py +++ b/python/pyspark/rddsampler.py @@ -20,6 +20,7 @@ class RDDSamplerBase(object): + def __init__(self, withReplacement, seed=None): try: import numpy @@ -30,7 +31,8 @@ def __init__(self, withReplacement, seed=None): "Falling back to default random generator for sampling.") self._use_numpy = False - self._seed = seed if seed is not None else random.randint(0, sys.maxint) + self._seed = seed if seed is not None else random.randint( + 0, sys.maxint) self._withReplacement = withReplacement self._random = None self._split = None @@ -85,7 +87,8 @@ def getPoissonSample(self, split, mean): def shuffle(self, vals): if self._random is None: - self.initRandomGenerator(0) # this should only ever called on the master so + # this should only ever called on the master so + self.initRandomGenerator(0) # the split does not matter if self._use_numpy: @@ -95,6 +98,7 @@ def shuffle(self, vals): class RDDSampler(RDDSamplerBase): + def __init__(self, withReplacement, fraction, seed=None): RDDSamplerBase.__init__(self, withReplacement, seed) self._fraction = fraction @@ -113,7 +117,9 @@ def func(self, split, iterator): if self.getUniformSample(split) <= self._fraction: yield obj + class RDDStratifiedSampler(RDDSamplerBase): + def __init__(self, withReplacement, fractions, seed=None): RDDSamplerBase.__init__(self, withReplacement, seed) self._fractions = fractions diff --git a/python/pyspark/resultiterable.py b/python/pyspark/resultiterable.py index df34740fc8176..ef04c82866e6c 100644 --- a/python/pyspark/resultiterable.py +++ b/python/pyspark/resultiterable.py @@ -21,9 +21,11 @@ class ResultIterable(collections.Iterable): + """ A special result iterable. This is used because the standard iterator can not be pickled """ + def __init__(self, data): self.data = data self.index = 0 diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 03b31ae9624c2..f6b8e2f7d5b82 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -108,6 +108,7 @@ def __ne__(self, other): class FramedSerializer(Serializer): + """ Serializer that writes objects as a stream of (length, data) pairs, where C{length} is a 32-bit integer and data is C{length} bytes. @@ -159,6 +160,7 @@ def loads(self, obj): class BatchedSerializer(Serializer): + """ Serializes a stream of objects in batches by calling its wrapped Serializer with streams of objects. 
@@ -204,6 +206,7 @@ def __str__(self): class CartesianDeserializer(FramedSerializer): + """ Deserializes the JavaRDD cartesian() of two PythonRDDs. """ @@ -237,6 +240,7 @@ def __str__(self): class PairDeserializer(CartesianDeserializer): + """ Deserializes the JavaRDD zip() of two PythonRDDs. """ @@ -268,6 +272,7 @@ def dumps(self, obj): class PickleSerializer(FramedSerializer): + """ Serializes objects using Python's cPickle serializer: @@ -290,6 +295,7 @@ def dumps(self, obj): class MarshalSerializer(FramedSerializer): + """ Serializes objects using Python's Marshal serializer: @@ -303,9 +309,11 @@ class MarshalSerializer(FramedSerializer): class AutoSerializer(FramedSerializer): + """ Choose marshal or cPickle as serialization protocol autumatically """ + def __init__(self): FramedSerializer.__init__(self) self._type = None @@ -330,6 +338,7 @@ def loads(self, obj): class UTF8Deserializer(Serializer): + """ Deserializes streams written by String.getBytes. """ diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index e1e7cd954189f..be22020fb827b 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -39,7 +39,8 @@ if os.environ.get("ADD_FILES") is not None else None) if os.environ.get("SPARK_EXECUTOR_URI"): - SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) + SparkContext.setSystemProperty( + "spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) sc = SparkContext(appName="PySparkShell", pyFiles=add_files) diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index e3923d1c36c57..2c68cd4921deb 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -45,7 +45,7 @@ def get_used_memory(): return int(line.split()[1]) >> 10 else: warnings.warn("Please install psutil to have better " - "support with spilling") + "support with spilling") if platform.system() == "Darwin": import resource rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss @@ -141,7 +141,7 @@ class ExternalMerger(Merger): This class works as follows: - - It repeatedly combine the items and save them in one dict in + - It repeatedly combine the items and save them in one dict in memory. 
- When the used memory goes above memory limit, it will split @@ -190,12 +190,12 @@ class ExternalMerger(Merger): MAX_TOTAL_PARTITIONS = 4096 def __init__(self, aggregator, memory_limit=512, serializer=None, - localdirs=None, scale=1, partitions=59, batch=1000): + localdirs=None, scale=1, partitions=59, batch=1000): Merger.__init__(self, aggregator) self.memory_limit = memory_limit # default serializer is only used for tests self.serializer = serializer or \ - BatchedSerializer(PickleSerializer(), 1024) + BatchedSerializer(PickleSerializer(), 1024) self.localdirs = localdirs or self._get_dirs() # number of partitions when spill data into disks self.partitions = partitions @@ -341,7 +341,7 @@ def _spill(self): self.pdata[i].clear() self.spills += 1 - gc.collect() # release the memory as much as possible + gc.collect() # release the memory as much as possible def iteritems(self): """ Return all merged items as iterator """ @@ -370,8 +370,8 @@ def _external_items(self): if (self.scale * self.partitions < self.MAX_TOTAL_PARTITIONS and j < self.spills - 1 and get_used_memory() > hard_limit): - self.data.clear() # will read from disk again - gc.collect() # release the memory as much as possible + self.data.clear() # will read from disk again + gc.collect() # release the memory as much as possible for v in self._recursive_merged_items(i): yield v return @@ -409,9 +409,9 @@ def _recursive_merged_items(self, start): for i in range(start, self.partitions): subdirs = [os.path.join(d, "parts", str(i)) - for d in self.localdirs] + for d in self.localdirs] m = ExternalMerger(self.agg, self.memory_limit, self.serializer, - subdirs, self.scale * self.partitions) + subdirs, self.scale * self.partitions) m.pdata = [{} for _ in range(self.partitions)] limit = self._next_limit() @@ -419,7 +419,7 @@ def _recursive_merged_items(self, start): path = self._get_spill_dir(j) p = os.path.join(path, str(i)) m._partitioned_mergeCombiners( - self.serializer.load_stream(open(p))) + self.serializer.load_stream(open(p))) if get_used_memory() > limit: m._spill() diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index f840475ffaf70..4a4b5c8a476fb 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -41,6 +41,7 @@ class DataType(object): + """Spark SQL DataType""" def __repr__(self): @@ -58,6 +59,7 @@ def __ne__(self, other): class PrimitiveTypeSingleton(type): + """Metaclass for PrimitiveType""" _instances = {} @@ -69,6 +71,7 @@ def __call__(cls): class PrimitiveType(DataType): + """Spark SQL PrimitiveType""" __metaclass__ = PrimitiveTypeSingleton @@ -79,6 +82,7 @@ def __eq__(self, other): class StringType(PrimitiveType): + """Spark SQL StringType The data type representing string values. @@ -86,6 +90,7 @@ class StringType(PrimitiveType): class BinaryType(PrimitiveType): + """Spark SQL BinaryType The data type representing bytearray values. @@ -93,6 +98,7 @@ class BinaryType(PrimitiveType): class BooleanType(PrimitiveType): + """Spark SQL BooleanType The data type representing bool values. @@ -100,6 +106,7 @@ class BooleanType(PrimitiveType): class TimestampType(PrimitiveType): + """Spark SQL TimestampType The data type representing datetime.datetime values. @@ -107,6 +114,7 @@ class TimestampType(PrimitiveType): class DecimalType(PrimitiveType): + """Spark SQL DecimalType The data type representing decimal.Decimal values. @@ -114,6 +122,7 @@ class DecimalType(PrimitiveType): class DoubleType(PrimitiveType): + """Spark SQL DoubleType The data type representing float values. 
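To make the ExternalMerger hunks above easier to follow, here is a toy sketch of the spill-then-merge idea behind _spill and _external_items. Everything about it is illustrative (plain pickle per partition, a summing combiner); the real class uses batched Pickle serialization and falls back to recursive re-partitioning when a single partition is still too large:

import os
import pickle

def spill(pdata, spill_dir, spill_round):
    # write each in-memory partition dict to its own file, then clear it
    path = os.path.join(spill_dir, str(spill_round))
    os.makedirs(path)
    for i, part in enumerate(pdata):
        with open(os.path.join(path, str(i)), "wb") as f:
            pickle.dump(part, f)
        part.clear()

def merged_items(spill_dir, rounds, partitions):
    # merge one partition at a time, so only 1/partitions of the
    # spilled data has to fit in memory at once
    for i in range(partitions):
        data = {}
        for j in range(rounds):
            with open(os.path.join(spill_dir, str(j), str(i)), "rb") as f:
                for k, v in pickle.load(f).items():
                    data[k] = data.get(k, 0) + v  # combiner: sum
        for item in data.items():
            yield item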
@@ -121,6 +130,7 @@ class DoubleType(PrimitiveType): class FloatType(PrimitiveType): + """Spark SQL FloatType The data type representing single precision floating-point values. @@ -128,6 +138,7 @@ class FloatType(PrimitiveType): class ByteType(PrimitiveType): + """Spark SQL ByteType The data type representing int values with 1 singed byte. @@ -135,6 +146,7 @@ class ByteType(PrimitiveType): class IntegerType(PrimitiveType): + """Spark SQL IntegerType The data type representing int values. @@ -142,6 +154,7 @@ class IntegerType(PrimitiveType): class LongType(PrimitiveType): + """Spark SQL LongType The data type representing long values. If the any value is @@ -151,6 +164,7 @@ class LongType(PrimitiveType): class ShortType(PrimitiveType): + """Spark SQL ShortType The data type representing int values with 2 signed bytes. @@ -158,6 +172,7 @@ class ShortType(PrimitiveType): class ArrayType(DataType): + """Spark SQL ArrayType The data type representing list values. An ArrayType object @@ -183,10 +198,11 @@ def __init__(self, elementType, containsNull=False): def __str__(self): return "ArrayType(%s,%s)" % (self.elementType, - str(self.containsNull).lower()) + str(self.containsNull).lower()) class MapType(DataType): + """Spark SQL MapType The data type representing dict values. A MapType object comprises @@ -222,10 +238,11 @@ def __init__(self, keyType, valueType, valueContainsNull=True): def __repr__(self): return "MapType(%s,%s,%s)" % (self.keyType, self.valueType, - str(self.valueContainsNull).lower()) + str(self.valueContainsNull).lower()) class StructField(DataType): + """Spark SQL StructField Represents a field in a StructType. @@ -259,10 +276,11 @@ def __init__(self, name, dataType, nullable): def __repr__(self): return "StructField(%s,%s,%s)" % (self.name, self.dataType, - str(self.nullable).lower()) + str(self.nullable).lower()) class StructType(DataType): + """Spark SQL StructType The data type representing rows. 
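Taken together, the composite types above describe a schema by hand. A small usage sketch against the constructors shown in these hunks (imports assume the pyspark.sql module as patched here; note that the PrimitiveTypeSingleton metaclass means StringType() always returns the same cached instance):

from pyspark.sql import (StructType, StructField, StringType,
                         IntegerType, ArrayType, MapType)

schema = StructType([
    StructField("name", StringType(), nullable=False),
    StructField("age", IntegerType(), nullable=True),
    StructField("tags", ArrayType(StringType(), containsNull=False), nullable=True),
    StructField("props", MapType(StringType(), IntegerType()), nullable=True),
])
print(schema)  # StructType(List(StructField(name,StringType,false),...))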
@@ -287,7 +305,7 @@ def __init__(self, fields): def __repr__(self): return ("StructType(List(%s))" % - ",".join(str(field) for field in self.fields)) + ",".join(str(field) for field in self.fields)) def _parse_datatype_list(datatype_list_string): @@ -315,7 +333,7 @@ def _parse_datatype_list(datatype_list_string): _all_primitive_types = dict((k, v) for k, v in globals().iteritems() - if type(v) is PrimitiveTypeSingleton and v.__base__ == PrimitiveType) + if type(v) is PrimitiveTypeSingleton and v.__base__ == PrimitiveType) def _parse_datatype_string(datatype_string): @@ -455,16 +473,16 @@ def _infer_schema(row): items = sorted(row.items()) elif isinstance(row, tuple): - if hasattr(row, "_fields"): # namedtuple + if hasattr(row, "_fields"): # namedtuple items = zip(row._fields, tuple(row)) - elif hasattr(row, "__FIELDS__"): # Row + elif hasattr(row, "__FIELDS__"): # Row items = zip(row.__FIELDS__, tuple(row)) elif all(isinstance(x, tuple) and len(x) == 2 for x in row): items = row else: raise ValueError("Can't infer schema from tuple") - elif hasattr(row, "__dict__"): # object + elif hasattr(row, "__dict__"): # object items = sorted(row.__dict__.items()) else: @@ -495,7 +513,7 @@ def _create_converter(obj, dataType): conv = lambda o: tuple(o.get(n) for n in names) elif isinstance(obj, tuple): - if hasattr(obj, "_fields"): # namedtuple + if hasattr(obj, "_fields"): # namedtuple conv = tuple elif hasattr(obj, "__FIELDS__"): conv = tuple @@ -504,7 +522,7 @@ def _create_converter(obj, dataType): else: raise ValueError("unexpected tuple") - elif hasattr(obj, "__dict__"): # object + elif hasattr(obj, "__dict__"): # object conv = lambda o: [o.__dict__.get(n, None) for n in names] nested = any(_has_struct(f.dataType) for f in dataType.fields) @@ -656,7 +674,7 @@ def _infer_schema_type(obj, dataType): assert len(fs) == len(obj), \ "Obj(%s) have different length with fields(%s)" % (obj, fs) fields = [StructField(f.name, _infer_schema_type(o, f.dataType), True) - for o, f in zip(obj, fs)] + for o, f in zip(obj, fs)] return StructType(fields) else: @@ -679,6 +697,7 @@ def _infer_schema_type(obj, dataType): StructType: (tuple, list), } + def _verify_type(obj, dataType): """ Verify the type of obj against dataType, raise an exception if @@ -724,7 +743,7 @@ def _verify_type(obj, dataType): elif isinstance(dataType, StructType): if len(obj) != len(dataType.fields): raise ValueError("Length of object (%d) does not match with" - "length of fields (%d)" % (len(obj), len(dataType.fields))) + "length of fields (%d)" % (len(obj), len(dataType.fields))) for v, f in zip(obj, dataType.fields): _verify_type(v, f.dataType) @@ -857,6 +876,7 @@ def __reduce__(self): raise Exception("unexpected data type: %s" % dataType) class Row(tuple): + """ Row in SchemaRDD """ __DATATYPE__ = dataType __FIELDS__ = tuple(f.name for f in dataType.fields) @@ -868,7 +888,7 @@ class Row(tuple): def __repr__(self): # call collect __repr__ for nested objects return ("Row(%s)" % ", ".join("%s=%r" % (n, getattr(self, n)) - for n in self.__FIELDS__)) + for n in self.__FIELDS__)) def __reduce__(self): return (_restore_object, (self.__DATATYPE__, tuple(self))) @@ -877,6 +897,7 @@ def __reduce__(self): class SQLContext: + """Main entry point for SparkSQL functionality. 
A SQLContext can be used create L{SchemaRDD}s, register L{SchemaRDD}s as @@ -975,7 +996,7 @@ def inferSchema(self, rdd): first = rdd.first() if not first: raise ValueError("The first row in RDD is empty, " - "can not infer schema") + "can not infer schema") if type(first) is dict: warnings.warn("Using RDD of dict to inferSchema is deprecated") @@ -1232,6 +1253,7 @@ def uncacheTable(self, tableName): class HiveContext(SQLContext): + """A variant of Spark SQL that integrates with data stored in Hive. Configuration for Hive is read from hive-site.xml on the classpath. @@ -1268,6 +1290,7 @@ def hql(self, hqlQuery): class LocalHiveContext(HiveContext): + """Starts up an instance of hive where metadata is stored locally. An in-process metadata data is created with data stored in ./metadata. @@ -1298,7 +1321,7 @@ class LocalHiveContext(HiveContext): def __init__(self, sparkContext, sqlContext=None): HiveContext.__init__(self, sparkContext, sqlContext) warnings.warn("LocalHiveContext is deprecated. " - "Use HiveContext instead.", DeprecationWarning) + "Use HiveContext instead.", DeprecationWarning) def _get_hive_ctx(self): return self._jvm.LocalHiveContext(self._jsc.sc()) @@ -1317,6 +1340,7 @@ def _create_row(fields, values): class Row(tuple): + """ A row in L{SchemaRDD}. The fields in it can be accessed like attributes. @@ -1358,7 +1382,6 @@ def __new__(self, *args, **kwargs): else: raise ValueError("No args or kwargs") - # let obect acs like class def __call__(self, *args): """create new Row object""" @@ -1384,12 +1407,13 @@ def __reduce__(self): def __repr__(self): if hasattr(self, "__FIELDS__"): return "Row(%s)" % ", ".join("%s=%r" % (k, v) - for k, v in zip(self.__FIELDS__, self)) + for k, v in zip(self.__FIELDS__, self)) else: return "" % ", ".join(self) class SchemaRDD(RDD): + """An RDD of L{Row} objects that has an associated schema. The underlying JVM object is a SchemaRDD, not a PythonRDD, so we can @@ -1596,7 +1620,7 @@ def subtract(self, other, numPartitions=None): rdd = self._jschema_rdd.subtract(other._jschema_rdd) else: rdd = self._jschema_rdd.subtract(other._jschema_rdd, - numPartitions) + numPartitions) return SchemaRDD(rdd, self.sql_ctx) else: raise ValueError("Can only subtract another SchemaRDD") @@ -1623,9 +1647,9 @@ def _test(): jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' - '"field6":[{"field7": "row2"}]}', + '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' - '"field3":{"field4":33, "field5": []}}' + '"field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py index 1e597d64e03fe..8b59d62bb40c3 100644 --- a/python/pyspark/statcounter.py +++ b/python/pyspark/statcounter.py @@ -51,13 +51,15 @@ def merge(self, value): return self - # Merge another StatCounter into this one, adding up the internal statistics. + # Merge another StatCounter into this one, adding up the internal + # statistics. 
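The mergeStats arithmetic that follows is the standard parallel update for a running count, mean, and sum of squared deviations. Spelled out as a standalone sketch (a hypothetical scalar helper, mirroring the general branch of the code below):

def merge_stats(n1, mu1, m2_1, n2, mu2, m2_2):
    delta = mu2 - mu1
    n = n1 + n2
    mu = (mu1 * n1 + mu2 * n2) / float(n)  # merged mean
    m2 = m2_1 + m2_2 + (delta * delta * n1 * n2) / float(n)  # merged sum of squared deviations
    return n, mu, m2

For example, merging the stats of [1, 2] (n=2, mu=1.5, m2=0.5) with those of [3] (n=1, mu=3.0, m2=0.0) gives (3, 2.0, 2.0), which matches computing the stats of [1, 2, 3] directly.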
def mergeStats(self, other): if not isinstance(other, StatCounter): raise Exception("Can only merge Statcounters!") if other is self: # reference equality holds - self.merge(copy.deepcopy(other)) # Avoid overwriting fields in a weird order + # Avoid overwriting fields in a weird order + self.merge(copy.deepcopy(other)) else: if self.n == 0: self.mu = other.mu @@ -73,12 +75,14 @@ def mergeStats(self, other): elif self.n * 10 < other.n: self.mu = other.mu - (delta * self.n) / (self.n + other.n) else: - self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n) + self.mu = ( + self.mu * self.n + other.mu * other.n) / (self.n + other.n) self.maxValue = maximum(self.maxValue, other.maxValue) self.minValue = minimum(self.minValue, other.minValue) - self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n) + self.m2 += other.m2 + \ + (delta * delta * self.n * other.n) / (self.n + other.n) self.n += other.n return self diff --git a/python/pyspark/storagelevel.py b/python/pyspark/storagelevel.py index 5d77a131f2856..2aa0fb9d2c1ed 100644 --- a/python/pyspark/storagelevel.py +++ b/python/pyspark/storagelevel.py @@ -19,6 +19,7 @@ class StorageLevel: + """ Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory, whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 16fb5a9256220..e5e455ebbe785 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -62,53 +62,53 @@ def setUp(self): self.N = 1 << 16 self.l = [i for i in xrange(self.N)] self.data = zip(self.l, self.l) - self.agg = Aggregator(lambda x: [x], - lambda x, y: x.append(y) or x, - lambda x, y: x.extend(y) or x) + self.agg = Aggregator(lambda x: [x], + lambda x, y: x.append(y) or x, + lambda x, y: x.extend(y) or x) def test_in_memory(self): m = InMemoryMerger(self.agg) m.mergeValues(self.data) self.assertEqual(sum(sum(v) for k, v in m.iteritems()), - sum(xrange(self.N))) + sum(xrange(self.N))) m = InMemoryMerger(self.agg) m.mergeCombiners(map(lambda (x, y): (x, [y]), self.data)) self.assertEqual(sum(sum(v) for k, v in m.iteritems()), - sum(xrange(self.N))) + sum(xrange(self.N))) def test_small_dataset(self): m = ExternalMerger(self.agg, 1000) m.mergeValues(self.data) self.assertEqual(m.spills, 0) self.assertEqual(sum(sum(v) for k, v in m.iteritems()), - sum(xrange(self.N))) + sum(xrange(self.N))) m = ExternalMerger(self.agg, 1000) m.mergeCombiners(map(lambda (x, y): (x, [y]), self.data)) self.assertEqual(m.spills, 0) self.assertEqual(sum(sum(v) for k, v in m.iteritems()), - sum(xrange(self.N))) + sum(xrange(self.N))) def test_medium_dataset(self): m = ExternalMerger(self.agg, 10) m.mergeValues(self.data) self.assertTrue(m.spills >= 1) self.assertEqual(sum(sum(v) for k, v in m.iteritems()), - sum(xrange(self.N))) + sum(xrange(self.N))) m = ExternalMerger(self.agg, 10) m.mergeCombiners(map(lambda (x, y): (x, [y]), self.data * 3)) self.assertTrue(m.spills >= 1) self.assertEqual(sum(sum(v) for k, v in m.iteritems()), - sum(xrange(self.N)) * 3) + sum(xrange(self.N)) * 3) def test_huge_dataset(self): m = ExternalMerger(self.agg, 10) m.mergeCombiners(map(lambda (k, v): (k, [str(v)]), self.data * 10)) self.assertTrue(m.spills >= 1) self.assertEqual(sum(len(v) for k, v in m._recursive_merged_items(0)), - self.N * 10) + self.N * 10) m._cleanup() @@ -177,6 +177,7 @@ def test_add_py_file(self): log4j = self.sc._jvm.org.apache.log4j old_level = 
log4j.LogManager.getRootLogger().getLevel() log4j.LogManager.getRootLogger().setLevel(log4j.Level.FATAL) + def func(x): from userlibrary import UserClass return UserClass().hello() @@ -215,10 +216,12 @@ def test_add_egg_file_locally(self): def func(): from userlib import UserClass self.assertRaises(ImportError, func) - path = os.path.join(SPARK_HOME, "python/test_support/userlib-0.1-py2.7.egg") + path = os.path.join( + SPARK_HOME, "python/test_support/userlib-0.1-py2.7.egg") self.sc.addPyFile(path) from userlib import UserClass - self.assertEqual("Hello World from inside a package!", UserClass().hello()) + self.assertEqual( + "Hello World from inside a package!", UserClass().hello()) class TestRDDFunctions(PySparkTestCase): @@ -226,7 +229,8 @@ class TestRDDFunctions(PySparkTestCase): def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.sc.stop() - self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) + self.assertRaises( + Exception, lambda: SparkContext("an-invalid-master-name")) self.sc = SparkContext("local") def test_save_as_textfile_with_unicode(self): @@ -315,7 +319,8 @@ def setUp(self): PySparkTestCase.setUp(self) self.tempdir = tempfile.NamedTemporaryFile(delete=False) os.unlink(self.tempdir.name) - self.sc._jvm.WriteInputFormatTestDataGenerator.generateData(self.tempdir.name, self.sc._jsc) + self.sc._jvm.WriteInputFormatTestDataGenerator.generateData( + self.tempdir.name, self.sc._jsc) def tearDown(self): PySparkTestCase.tearDown(self) @@ -326,18 +331,20 @@ def test_sequencefiles(self): ints = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfint/", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), + (2, u'bb'), (2, u'bb'), (3, u'cc')] self.assertEqual(ints, ei) doubles = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfdouble/", "org.apache.hadoop.io.DoubleWritable", "org.apache.hadoop.io.Text").collect()) - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] + ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), + (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] self.assertEqual(doubles, ed) bytes = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbytes/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BytesWritable").collect()) + "org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.BytesWritable").collect()) ebs = [(1, bytearray('aa', 'utf-8')), (1, bytearray('aa', 'utf-8')), (2, bytearray('aa', 'utf-8')), @@ -360,7 +367,8 @@ def test_sequencefiles(self): bools = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbool/", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.BooleanWritable").collect()) - eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] + eb = [(1, False), (1, True), (2, False), + (2, False), (2, True), (3, True)] self.assertEqual(bools, eb) nulls = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfnull/", @@ -409,9 +417,9 @@ def test_sequencefiles(self): self.assertEqual(clazz[0], ec) unbatched_clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/", - "org.apache.hadoop.io.Text", - "org.apache.spark.api.python.TestWritable", - batchSize=1).collect()) + "org.apache.hadoop.io.Text", + "org.apache.spark.api.python.TestWritable", + batchSize=1).collect()) self.assertEqual(unbatched_clazz[0], ec) def test_oldhadoop(self): @@ -420,11 +428,12 @@ def 
test_oldhadoop(self): "org.apache.hadoop.mapred.SequenceFileInputFormat", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), + (2, u'bb'), (2, u'bb'), (3, u'cc')] self.assertEqual(ints, ei) hellopath = os.path.join(SPARK_HOME, "python/test_support/hello.txt") - oldconf = {"mapred.input.dir" : hellopath} + oldconf = {"mapred.input.dir": hellopath} hello = self.sc.hadoopRDD("org.apache.hadoop.mapred.TextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text", @@ -439,11 +448,12 @@ def test_newhadoop(self): "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), + (2, u'bb'), (2, u'bb'), (3, u'cc')] self.assertEqual(ints, ei) hellopath = os.path.join(SPARK_HOME, "python/test_support/hello.txt") - newconf = {"mapred.input.dir" : hellopath} + newconf = {"mapred.input.dir": hellopath} hello = self.sc.newAPIHadoopRDD("org.apache.hadoop.mapreduce.lib.input.TextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text", @@ -498,6 +508,7 @@ def test_converters(self): (u'\x03', [2.0])] self.assertEqual(maps, em) + class TestOutputFormat(PySparkTestCase): def setUp(self): @@ -511,17 +522,21 @@ def tearDown(self): def test_sequencefiles(self): basepath = self.tempdir.name - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), + (2, u'bb'), (2, u'bb'), (3, u'cc')] self.sc.parallelize(ei).saveAsSequenceFile(basepath + "/sfint/") ints = sorted(self.sc.sequenceFile(basepath + "/sfint/").collect()) self.assertEqual(ints, ei) - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] + ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), + (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] self.sc.parallelize(ed).saveAsSequenceFile(basepath + "/sfdouble/") - doubles = sorted(self.sc.sequenceFile(basepath + "/sfdouble/").collect()) + doubles = sorted( + self.sc.sequenceFile(basepath + "/sfdouble/").collect()) self.assertEqual(doubles, ed) - ebs = [(1, bytearray(b'\x00\x07spam\x08')), (2, bytearray(b'\x00\x07spam\x08'))] + ebs = [(1, bytearray(b'\x00\x07spam\x08')), + (2, bytearray(b'\x00\x07spam\x08'))] self.sc.parallelize(ebs).saveAsSequenceFile(basepath + "/sfbytes/") bytes = sorted(self.sc.sequenceFile(basepath + "/sfbytes/").collect()) self.assertEqual(bytes, ebs) @@ -533,7 +548,8 @@ def test_sequencefiles(self): text = sorted(self.sc.sequenceFile(basepath + "/sftext/").collect()) self.assertEqual(text, et) - eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] + eb = [(1, False), (1, True), (2, False), + (2, False), (2, True), (3, True)] self.sc.parallelize(eb).saveAsSequenceFile(basepath + "/sfbool/") bools = sorted(self.sc.sequenceFile(basepath + "/sfbool/").collect()) self.assertEqual(bools, eb) @@ -555,8 +571,8 @@ def test_sequencefiles(self): def test_oldhadoop(self): basepath = self.tempdir.name dict_data = [(1, {}), - (1, {"row1" : 1.0}), - (2, {"row2" : 2.0})] + (1, {"row1": 1.0}), + (2, {"row2": 2.0})] self.sc.parallelize(dict_data).saveAsHadoopFile( basepath + "/oldhadoop/", "org.apache.hadoop.mapred.SequenceFileOutputFormat", @@ -570,12 +586,12 @@ def test_oldhadoop(self): 
self.assertEqual(result, dict_data) conf = { - "mapred.output.format.class" : "org.apache.hadoop.mapred.SequenceFileOutputFormat", - "mapred.output.key.class" : "org.apache.hadoop.io.IntWritable", - "mapred.output.value.class" : "org.apache.hadoop.io.MapWritable", - "mapred.output.dir" : basepath + "/olddataset/"} + "mapred.output.format.class": "org.apache.hadoop.mapred.SequenceFileOutputFormat", + "mapred.output.key.class": "org.apache.hadoop.io.IntWritable", + "mapred.output.value.class": "org.apache.hadoop.io.MapWritable", + "mapred.output.dir": basepath + "/olddataset/"} self.sc.parallelize(dict_data).saveAsHadoopDataset(conf) - input_conf = {"mapred.input.dir" : basepath + "/olddataset/"} + input_conf = {"mapred.input.dir": basepath + "/olddataset/"} old_dataset = sorted(self.sc.hadoopRDD( "org.apache.hadoop.mapred.SequenceFileInputFormat", "org.apache.hadoop.io.IntWritable", @@ -603,14 +619,15 @@ def test_newhadoop(self): valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect()) self.assertEqual(result, array_data) - conf = {"mapreduce.outputformat.class" : - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "mapred.output.key.class" : "org.apache.hadoop.io.IntWritable", - "mapred.output.value.class" : "org.apache.spark.api.python.DoubleArrayWritable", - "mapred.output.dir" : basepath + "/newdataset/"} - self.sc.parallelize(array_data).saveAsNewAPIHadoopDataset(conf, + conf = {"mapreduce.outputformat.class": + "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", + "mapred.output.key.class": "org.apache.hadoop.io.IntWritable", + "mapred.output.value.class": "org.apache.spark.api.python.DoubleArrayWritable", + "mapred.output.dir": basepath + "/newdataset/"} + self.sc.parallelize(array_data).saveAsNewAPIHadoopDataset( + conf, valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter") - input_conf = {"mapred.input.dir" : basepath + "/newdataset/"} + input_conf = {"mapred.input.dir": basepath + "/newdataset/"} new_dataset = sorted(self.sc.newAPIHadoopRDD( "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "org.apache.hadoop.io.IntWritable", @@ -621,7 +638,7 @@ def test_newhadoop(self): def test_newolderror(self): basepath = self.tempdir.name - rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x )) + rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x)) self.assertRaises(Exception, lambda: rdd.saveAsHadoopFile( basepath + "/newolderror/saveAsHadoopFile/", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")) @@ -631,7 +648,7 @@ def test_newolderror(self): def test_bad_inputs(self): basepath = self.tempdir.name - rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x )) + rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x)) self.assertRaises(Exception, lambda: rdd.saveAsHadoopFile( basepath + "/badinputs/saveAsHadoopFile/", "org.apache.hadoop.mapred.NotValidOutputFormat")) @@ -650,7 +667,8 @@ def test_converters(self): "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", keyConverter="org.apache.spark.api.python.TestOutputKeyConverter", valueConverter="org.apache.spark.api.python.TestOutputValueConverter") - converted = sorted(self.sc.sequenceFile(basepath + "/converters/").collect()) + converted = sorted( + self.sc.sequenceFile(basepath + "/converters/").collect()) expected = [(u'1', 3.0), (u'2', 1.0), (u'3', 2.0)] @@ -663,62 +681,72 @@ def test_reserialization(self): data = zip(x, y) rdd = 
self.sc.parallelize(x).zip(self.sc.parallelize(y)) rdd.saveAsSequenceFile(basepath + "/reserialize/sequence") - result1 = sorted(self.sc.sequenceFile(basepath + "/reserialize/sequence").collect()) + result1 = sorted( + self.sc.sequenceFile(basepath + "/reserialize/sequence").collect()) self.assertEqual(result1, data) rdd.saveAsHadoopFile(basepath + "/reserialize/hadoop", "org.apache.hadoop.mapred.SequenceFileOutputFormat") - result2 = sorted(self.sc.sequenceFile(basepath + "/reserialize/hadoop").collect()) + result2 = sorted( + self.sc.sequenceFile(basepath + "/reserialize/hadoop").collect()) self.assertEqual(result2, data) - rdd.saveAsNewAPIHadoopFile(basepath + "/reserialize/newhadoop", - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat") - result3 = sorted(self.sc.sequenceFile(basepath + "/reserialize/newhadoop").collect()) + rdd.saveAsNewAPIHadoopFile( + basepath + "/reserialize/newhadoop", + "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat") + result3 = sorted( + self.sc.sequenceFile(basepath + "/reserialize/newhadoop").collect()) self.assertEqual(result3, data) conf4 = { - "mapred.output.format.class" : "org.apache.hadoop.mapred.SequenceFileOutputFormat", - "mapred.output.key.class" : "org.apache.hadoop.io.IntWritable", - "mapred.output.value.class" : "org.apache.hadoop.io.IntWritable", - "mapred.output.dir" : basepath + "/reserialize/dataset"} + "mapred.output.format.class": "org.apache.hadoop.mapred.SequenceFileOutputFormat", + "mapred.output.key.class": "org.apache.hadoop.io.IntWritable", + "mapred.output.value.class": "org.apache.hadoop.io.IntWritable", + "mapred.output.dir": basepath + "/reserialize/dataset"} rdd.saveAsHadoopDataset(conf4) - result4 = sorted(self.sc.sequenceFile(basepath + "/reserialize/dataset").collect()) + result4 = sorted( + self.sc.sequenceFile(basepath + "/reserialize/dataset").collect()) self.assertEqual(result4, data) - conf5 = {"mapreduce.outputformat.class" : - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "mapred.output.key.class" : "org.apache.hadoop.io.IntWritable", - "mapred.output.value.class" : "org.apache.hadoop.io.IntWritable", - "mapred.output.dir" : basepath + "/reserialize/newdataset"} + conf5 = {"mapreduce.outputformat.class": + "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", + "mapred.output.key.class": "org.apache.hadoop.io.IntWritable", + "mapred.output.value.class": "org.apache.hadoop.io.IntWritable", + "mapred.output.dir": basepath + "/reserialize/newdataset"} rdd.saveAsNewAPIHadoopDataset(conf5) - result5 = sorted(self.sc.sequenceFile(basepath + "/reserialize/newdataset").collect()) + result5 = sorted( + self.sc.sequenceFile(basepath + "/reserialize/newdataset").collect()) self.assertEqual(result5, data) def test_unbatched_save_and_read(self): basepath = self.tempdir.name - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), + (2, u'bb'), (2, u'bb'), (3, u'cc')] self.sc.parallelize(ei, numSlices=len(ei)).saveAsSequenceFile( basepath + "/unbatched/") unbatched_sequence = sorted(self.sc.sequenceFile(basepath + "/unbatched/", - batchSize=1).collect()) + batchSize=1).collect()) self.assertEqual(unbatched_sequence, ei) - unbatched_hadoopFile = sorted(self.sc.hadoopFile(basepath + "/unbatched/", - "org.apache.hadoop.mapred.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text", - batchSize=1).collect()) + unbatched_hadoopFile = sorted( + 
self.sc.hadoopFile(basepath + "/unbatched/", + "org.apache.hadoop.mapred.SequenceFileInputFormat", + "org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.Text", + batchSize=1).collect()) self.assertEqual(unbatched_hadoopFile, ei) - unbatched_newAPIHadoopFile = sorted(self.sc.newAPIHadoopFile(basepath + "/unbatched/", - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text", - batchSize=1).collect()) + unbatched_newAPIHadoopFile = sorted( + self.sc.newAPIHadoopFile( + basepath + "/unbatched/", + "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", + "org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.Text", + batchSize=1).collect()) self.assertEqual(unbatched_newAPIHadoopFile, ei) - oldconf = {"mapred.input.dir" : basepath + "/unbatched/"} + oldconf = {"mapred.input.dir": basepath + "/unbatched/"} unbatched_hadoopRDD = sorted(self.sc.hadoopRDD( "org.apache.hadoop.mapred.SequenceFileInputFormat", "org.apache.hadoop.io.IntWritable", @@ -727,7 +755,7 @@ def test_unbatched_save_and_read(self): batchSize=1).collect()) self.assertEqual(unbatched_hadoopRDD, ei) - newconf = {"mapred.input.dir" : basepath + "/unbatched/"} + newconf = {"mapred.input.dir": basepath + "/unbatched/"} unbatched_newAPIHadoopRDD = sorted(self.sc.newAPIHadoopRDD( "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "org.apache.hadoop.io.IntWritable", @@ -744,7 +772,9 @@ def test_malformed_RDD(self): self.assertRaises(Exception, lambda: rdd.saveAsSequenceFile( basepath + "/malformed/sequence")) + class TestDaemon(unittest.TestCase): + def connect(self, port): from socket import socket, AF_INET, SOCK_STREAM sock = socket(AF_INET, SOCK_STREAM) @@ -791,9 +821,11 @@ def test_termination_sigterm(self): class TestSparkSubmit(unittest.TestCase): + def setUp(self): self.programDir = tempfile.mkdtemp() - self.sparkSubmit = os.path.join(os.environ.get("SPARK_HOME"), "bin", "spark-submit") + self.sparkSubmit = os.path.join( + os.environ.get("SPARK_HOME"), "bin", "spark-submit") def tearDown(self): shutil.rmtree(self.programDir) @@ -830,7 +862,8 @@ def test_single_script(self): |sc = SparkContext() |print sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect() """) - proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE) + proc = subprocess.Popen( + [self.sparkSubmit, script], stdout=subprocess.PIPE) out, err = proc.communicate() self.assertEqual(0, proc.returncode) self.assertIn("[2, 4, 6]", out) @@ -846,7 +879,8 @@ def test_script_with_local_functions(self): |sc = SparkContext() |print sc.parallelize([1, 2, 3]).map(foo).collect() """) - proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE) + proc = subprocess.Popen( + [self.sparkSubmit, script], stdout=subprocess.PIPE) out, err = proc.communicate() self.assertEqual(0, proc.returncode) self.assertIn("[3, 6, 9]", out) @@ -884,7 +918,8 @@ def test_module_dependency_on_cluster(self): | return x + 1 """) proc = subprocess.Popen( - [self.sparkSubmit, "--py-files", zip, "--master", "local-cluster[1,1,512]", script], + [self.sparkSubmit, "--py-files", zip, "--master", + "local-cluster[1,1,512]", script], stdout=subprocess.PIPE) out, err = proc.communicate() self.assertEqual(0, proc.returncode) @@ -911,6 +946,7 @@ def test_single_script_on_cluster(self): @unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): + """General PySpark tests that depend on scipy """ def test_serialize(self): @@ -923,15 
+959,17 @@ def test_serialize(self): @unittest.skipIf(not _have_numpy, "NumPy not installed") class NumPyTests(PySparkTestCase): + """General PySpark tests that depend on numpy """ def test_statcounter_array(self): - x = self.sc.parallelize([np.array([1.0,1.0]), np.array([2.0,2.0]), np.array([3.0,3.0])]) + x = self.sc.parallelize( + [np.array([1.0, 1.0]), np.array([2.0, 2.0]), np.array([3.0, 3.0])]) s = x.stats() - self.assertSequenceEqual([2.0,2.0], s.mean().tolist()) - self.assertSequenceEqual([1.0,1.0], s.min().tolist()) - self.assertSequenceEqual([3.0,3.0], s.max().tolist()) - self.assertSequenceEqual([1.0,1.0], s.sampleStdev().tolist()) + self.assertSequenceEqual([2.0, 2.0], s.mean().tolist()) + self.assertSequenceEqual([1.0, 1.0], s.min().tolist()) + self.assertSequenceEqual([3.0, 3.0], s.max().tolist()) + self.assertSequenceEqual([1.0, 1.0], s.sampleStdev().tolist()) if __name__ == "__main__": diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 2770f63059853..14a63256b551f 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -56,8 +56,10 @@ def main(infile, outfile): SparkFiles._root_directory = spark_files_dir SparkFiles._is_running_on_worker = True - # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH - sys.path.append(spark_files_dir) # *.py files that were added will be copied here + # fetch names of includes (*.zip and *.egg files) and construct + # PYTHONPATH + # *.py files that were added will be copied here + sys.path.append(spark_files_dir) num_python_includes = read_int(infile) for _ in range(num_python_includes): filename = utf8_deserializer.loads(infile) diff --git a/python/test_support/userlibrary.py b/python/test_support/userlibrary.py index 8e4a6292bc17c..73fd26e71f10d 100755 --- a/python/test_support/userlibrary.py +++ b/python/test_support/userlibrary.py @@ -19,6 +19,8 @@ Used to test shipping of code depenencies with SparkContext.addPyFile(). """ + class UserClass(object): + def hello(self): return "Hello World!" From 9a66cb0391794f18733cf54cfe2a337a75aeefb4 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 03:02:30 -0400 Subject: [PATCH 08/23] resolving merge conflicts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit *fingers crossed* I admit I’m not exactly sure how this works… Let’s see if I did the right thing. 
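For context on the addPyFile tests and the worker-side sys.path handling patched above: files shipped with addPyFile are downloaded into spark_files_dir on each worker, and that directory is appended to sys.path, so imports inside closures resolve. Roughly, mirroring test_add_py_file (the path is the repo's own test support file):

from pyspark import SparkContext

sc = SparkContext("local", "py-deps")
sc.addPyFile("python/test_support/userlibrary.py")

def use_shipped(_):
    from userlibrary import UserClass  # resolvable on workers via spark_files_dir
    return UserClass().hello()

print(sc.parallelize(range(2)).map(use_shipped).collect())
# ['Hello World!', 'Hello World!']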
--- LICENSE | 1 + bin/run-example | 3 +- bin/run-example2.cmd | 3 +- .../main/scala/org/apache/spark/Logging.scala | 10 +- .../apache/spark/deploy/worker/Worker.scala | 9 +- .../spark/storage/BlockManagerSource.scala | 5 +- .../apache/spark/storage/StorageUtils.scala | 11 +- .../scala/org/apache/spark/DriverSuite.scala | 2 +- .../spark/deploy/SparkSubmitSuite.scala | 2 +- .../deploy/worker/ExecutorRunnerTest.scala | 2 +- dev/audit-release/audit_release.py | 4 +- .../src/main/scala/SparkApp.scala | 7 + dev/audit-release/sbt_app_kinesis/build.sbt | 28 + .../src/main/scala/SparkApp.scala | 33 + .../sbt_app_sql/src/main/scala/SqlApp.scala | 2 +- dev/create-release/create-release.sh | 4 +- dev/run-tests | 3 + docs/sql-programming-guide.md | 18 +- docs/streaming-custom-receivers.md | 4 +- docs/streaming-kinesis.md | 58 ++ docs/streaming-programming-guide.md | 12 +- examples/pom.xml | 13 + .../spark/examples/sql/JavaSparkSQL.java | 8 +- .../main/python/mllib/decision_tree_runner.py | 133 ++++ .../main/python/mllib/logistic_regression.py | 4 +- .../spark/examples/sql/RDDRelation.scala | 4 +- .../examples/sql/hive/HiveFromSpark.scala | 2 +- extras/kinesis-asl/pom.xml | 96 +++ .../streaming/JavaKinesisWordCountASL.java | 180 ++++++ .../src/main/resources/log4j.properties | 37 ++ .../streaming/KinesisWordCountASL.scala | 251 ++++++++ .../kinesis/KinesisCheckpointState.scala | 56 ++ .../streaming/kinesis/KinesisReceiver.scala | 149 +++++ .../kinesis/KinesisRecordProcessor.scala | 212 +++++++ .../streaming/kinesis/KinesisUtils.scala | 96 +++ .../kinesis/JavaKinesisStreamSuite.java | 41 ++ .../src/test/resources/log4j.properties | 26 + .../kinesis/KinesisReceiverSuite.scala | 275 +++++++++ .../mllib/api/python/PythonMLLibAPI.scala | 78 +++ .../mllib/tree/configuration/Strategy.scala | 3 +- .../spark/mllib/tree/DecisionTreeSuite.scala | 3 +- pom.xml | 19 +- project/SparkBuild.scala | 6 +- python/pyspark/mllib/_common.py | 34 +- python/pyspark/mllib/_common.py.orig | 572 ++++++++++++++++++ python/pyspark/mllib/tests.py | 36 ++ python/pyspark/mllib/tree.py | 225 +++++++ python/pyspark/mllib/util.py | 15 +- python/pyspark/mllib/util.py.orig | 211 +++++++ python/pyspark/sql.py | 55 +- python/run-tests | 1 + sql/catalyst/pom.xml | 5 - .../sql/catalyst/analysis/Analyzer.scala | 4 +- .../catalyst/analysis/FunctionRegistry.scala | 32 + .../catalyst/analysis/HiveTypeCoercion.scala | 8 +- .../catalyst/expressions/BoundAttribute.scala | 2 +- .../sql/catalyst/expressions/ScalaUdf.scala | 307 ++++++++++ .../codegen/GenerateOrdering.scala | 4 +- .../apache/spark/sql/catalyst/package.scala | 1 - .../sql/catalyst/planning/QueryPlanner.scala | 2 +- .../sql/catalyst/planning/patterns.scala | 6 +- .../spark/sql/catalyst/rules/Rule.scala | 2 +- .../sql/catalyst/rules/RuleExecutor.scala | 12 +- .../spark/sql/catalyst/trees/package.scala | 8 +- .../org/apache/spark/sql/api/java/UDF1.java | 32 + .../org/apache/spark/sql/api/java/UDF10.java | 32 + .../org/apache/spark/sql/api/java/UDF11.java | 32 + .../org/apache/spark/sql/api/java/UDF12.java | 32 + .../org/apache/spark/sql/api/java/UDF13.java | 32 + .../org/apache/spark/sql/api/java/UDF14.java | 32 + .../org/apache/spark/sql/api/java/UDF15.java | 32 + .../org/apache/spark/sql/api/java/UDF16.java | 32 + .../org/apache/spark/sql/api/java/UDF17.java | 32 + .../org/apache/spark/sql/api/java/UDF18.java | 32 + .../org/apache/spark/sql/api/java/UDF19.java | 32 + .../org/apache/spark/sql/api/java/UDF2.java | 32 + .../org/apache/spark/sql/api/java/UDF20.java | 32 + 
.../org/apache/spark/sql/api/java/UDF21.java | 32 + .../org/apache/spark/sql/api/java/UDF22.java | 32 + .../org/apache/spark/sql/api/java/UDF3.java | 32 + .../org/apache/spark/sql/api/java/UDF4.java | 32 + .../org/apache/spark/sql/api/java/UDF5.java | 32 + .../org/apache/spark/sql/api/java/UDF6.java | 32 + .../org/apache/spark/sql/api/java/UDF7.java | 32 + .../org/apache/spark/sql/api/java/UDF8.java | 32 + .../org/apache/spark/sql/api/java/UDF9.java | 32 + .../org/apache/spark/sql/SQLContext.scala | 17 +- .../org/apache/spark/sql/SchemaRDD.scala | 2 +- .../org/apache/spark/sql/SchemaRDDLike.scala | 5 +- .../apache/spark/sql/UdfRegistration.scala | 196 ++++++ .../spark/sql/api/java/JavaSQLContext.scala | 7 +- .../spark/sql/api/java/UDFRegistration.scala | 252 ++++++++ .../CompressibleColumnBuilder.scala | 5 +- .../apache/spark/sql/execution/Exchange.scala | 2 +- .../spark/sql/execution/SparkStrategies.scala | 2 + .../apache/spark/sql/execution/joins.scala | 3 +- .../spark/sql/execution/pythonUdfs.scala | 177 ++++++ .../org/apache/spark/sql/json/JsonRDD.scala | 2 +- .../scala/org/apache/spark/sql/package.scala | 2 - .../spark/sql/api/java/JavaAPISuite.java | 90 +++ .../sql/api/java/JavaApplySchemaSuite.java | 6 +- .../apache/spark/sql/CachedTableSuite.scala | 2 +- .../apache/spark/sql/InsertIntoSuite.scala | 6 +- .../org/apache/spark/sql/JoinSuite.scala | 4 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 6 +- .../sql/ScalaReflectionRelationSuite.scala | 8 +- .../scala/org/apache/spark/sql/TestData.scala | 38 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 36 ++ .../spark/sql/api/java/JavaSQLSuite.scala | 10 +- .../spark/sql/columnar/ColumnTypeSuite.scala | 4 +- .../columnar/InMemoryColumnarQuerySuite.scala | 12 + .../org/apache/spark/sql/json/JsonSuite.scala | 22 +- .../spark/sql/parquet/ParquetQuerySuite.scala | 26 +- .../hive/thriftserver/HiveThriftServer2.scala | 12 +- .../hive/thriftserver/SparkSQLCLIDriver.scala | 2 +- .../hive/thriftserver/SparkSQLDriver.scala | 6 +- .../sql/hive/thriftserver/SparkSQLEnv.scala | 6 +- .../server/SparkSQLOperationManager.scala | 13 +- .../thriftserver/HiveThriftServer2Suite.scala | 2 +- .../apache/spark/sql/hive/HiveContext.scala | 15 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 3 +- .../org/apache/spark/sql/hive/HiveQl.scala | 13 +- .../org/apache/spark/sql/hive/TestHive.scala | 14 +- .../org/apache/spark/sql/hive/hiveUdfs.scala | 10 +- .../org/apache/spark/sql/QueryTest.scala | 4 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 2 +- .../sql/hive/api/java/JavaHiveQLSuite.scala | 4 +- .../hive/execution/HiveComparisonTest.scala | 22 +- .../hive/execution/HiveQueryFileTest.scala | 2 +- .../sql/hive/execution/HiveQuerySuite.scala | 6 +- .../hive/execution/HiveResolutionSuite.scala | 4 +- .../spark/sql/parquet/HiveParquetSuite.scala | 8 +- 132 files changed, 4986 insertions(+), 263 deletions(-) create mode 100644 dev/audit-release/sbt_app_kinesis/build.sbt create mode 100644 dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala create mode 100644 docs/streaming-kinesis.md create mode 100755 examples/src/main/python/mllib/decision_tree_runner.py create mode 100644 extras/kinesis-asl/pom.xml create mode 100644 extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java create mode 100644 extras/kinesis-asl/src/main/resources/log4j.properties create mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala create mode 100644 
extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala create mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala create mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala create mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala create mode 100644 extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java create mode 100644 extras/kinesis-asl/src/test/resources/log4j.properties create mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala create mode 100644 python/pyspark/mllib/_common.py.orig create mode 100644 python/pyspark/mllib/tree.py create mode 100644 python/pyspark/mllib/util.py.orig create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF1.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF10.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF11.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF12.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF13.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF14.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF15.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF16.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF17.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF18.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF19.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF2.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF20.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF21.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF22.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF3.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF4.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF5.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF6.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF7.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF8.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/api/java/UDF9.java create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/UdfRegistration.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/api/java/UDFRegistration.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala create mode 100644 sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala diff --git a/LICENSE b/LICENSE index 76a3601c66918..e9a1153fdc5db 100644 --- a/LICENSE +++ b/LICENSE @@ -549,3 +549,4 @@ The following components are provided under the MIT License. 
See project link fo (MIT License) pyrolite (org.spark-project:pyrolite:2.0.1 - http://pythonhosted.org/Pyro4/) (MIT License) scopt (com.github.scopt:scopt_2.10:3.2.0 - https://github.com/scopt/scopt) (The MIT License) Mockito (org.mockito:mockito-all:1.8.5 - http://www.mockito.org) + (MIT License) jquery (https://jquery.org/license/) diff --git a/bin/run-example b/bin/run-example index 942706d733122..68a35702eddd3 100755 --- a/bin/run-example +++ b/bin/run-example @@ -29,7 +29,8 @@ if [ -n "$1" ]; then else echo "Usage: ./bin/run-example [example-args]" 1>&2 echo " - set MASTER=XX to use a specific master" 1>&2 - echo " - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)" 1>&2 + echo " - can use abbreviated example class name relative to com.apache.spark.examples" 1>&2 + echo " (e.g. SparkPi, mllib.LinearRegression, streaming.KinesisWordCountASL)" 1>&2 exit 1 fi diff --git a/bin/run-example2.cmd b/bin/run-example2.cmd index eadedd7fa61ff..b29bf90c64e90 100644 --- a/bin/run-example2.cmd +++ b/bin/run-example2.cmd @@ -32,7 +32,8 @@ rem Test that an argument was given if not "x%1"=="x" goto arg_given echo Usage: run-example ^ [example-args] echo - set MASTER=XX to use a specific master - echo - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression) + echo - can use abbreviated example class name relative to com.apache.spark.examples + echo (e.g. SparkPi, mllib.LinearRegression, streaming.KinesisWordCountASL) goto exit :arg_given diff --git a/core/src/main/scala/org/apache/spark/Logging.scala b/core/src/main/scala/org/apache/spark/Logging.scala index 807ef3e9c9d60..d4f2624061e35 100644 --- a/core/src/main/scala/org/apache/spark/Logging.scala +++ b/core/src/main/scala/org/apache/spark/Logging.scala @@ -39,13 +39,17 @@ trait Logging { // be serialized and used on another machine @transient private var log_ : Logger = null + // Method to get the logger name for this object + protected def logName = { + // Ignore trailing $'s in the class names for Scala objects + this.getClass.getName.stripSuffix("$") + } + // Method to get or create the logger for this object protected def log: Logger = { if (log_ == null) { initializeIfNecessary() - var className = this.getClass.getName - // Ignore trailing $'s in the class names for Scala objects - log_ = LoggerFactory.getLogger(className.stripSuffix("$")) + log_ = LoggerFactory.getLogger(logName) } log_ } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index c6ea42fceb659..458d9947bd873 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -71,7 +71,7 @@ private[spark] class Worker( // TTL for app folders/data; after TTL expires it will be cleaned up val APP_DATA_RETENTION_SECS = conf.getLong("spark.worker.cleanup.appDataTtl", 7 * 24 * 3600) - + val testing: Boolean = sys.props.contains("spark.testing") val masterLock: Object = new Object() var master: ActorSelection = null var masterAddress: Address = null @@ -82,7 +82,12 @@ private[spark] class Worker( @volatile var connected = false val workerId = generateWorkerId() val sparkHome = - new File(sys.props.get("spark.test.home").orElse(sys.env.get("SPARK_HOME")).getOrElse(".")) + if (testing) { + assert(sys.props.contains("spark.test.home"), "spark.test.home is not set!") + new File(sys.props("spark.test.home")) + } else { + new 
File(sys.env.get("SPARK_HOME").getOrElse(".")) + } var workDir: File = null val executors = new HashMap[String, ExecutorRunner] val finishedExecutors = new HashMap[String, ExecutorRunner] diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala index e939318a029dd..3f14c40ec61cb 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala @@ -46,9 +46,8 @@ private[spark] class BlockManagerSource(val blockManager: BlockManager, sc: Spar metricRegistry.register(MetricRegistry.name("memory", "memUsed_MB"), new Gauge[Long] { override def getValue: Long = { val storageStatusList = blockManager.master.getStorageStatus - val maxMem = storageStatusList.map(_.maxMem).sum - val remainingMem = storageStatusList.map(_.memRemaining).sum - (maxMem - remainingMem) / 1024 / 1024 + val memUsed = storageStatusList.map(_.memUsed).sum + memUsed / 1024 / 1024 } }) diff --git a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala index 0a0a448baa2ef..2bd6b749be261 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala @@ -172,16 +172,13 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { def memRemaining: Long = maxMem - memUsed /** Return the memory used by this block manager. */ - def memUsed: Long = - _nonRddStorageInfo._1 + _rddBlocks.keys.toSeq.map(memUsedByRdd).sum + def memUsed: Long = _nonRddStorageInfo._1 + _rddBlocks.keys.toSeq.map(memUsedByRdd).sum /** Return the disk space used by this block manager. */ - def diskUsed: Long = - _nonRddStorageInfo._2 + _rddBlocks.keys.toSeq.map(diskUsedByRdd).sum + def diskUsed: Long = _nonRddStorageInfo._2 + _rddBlocks.keys.toSeq.map(diskUsedByRdd).sum /** Return the off-heap space used by this block manager. */ - def offHeapUsed: Long = - _nonRddStorageInfo._3 + _rddBlocks.keys.toSeq.map(offHeapUsedByRdd).sum + def offHeapUsed: Long = _nonRddStorageInfo._3 + _rddBlocks.keys.toSeq.map(offHeapUsedByRdd).sum /** Return the memory used by the given RDD in this block manager in O(1) time. 
*/ def memUsedByRdd(rddId: Int): Long = _rddStorageInfo.get(rddId).map(_._1).getOrElse(0L) @@ -246,7 +243,7 @@ private[spark] object StorageUtils { val rddId = rddInfo.id // Assume all blocks belonging to the same RDD have the same storage level val storageLevel = statuses - .map(_.rddStorageLevel(rddId)).flatMap(s => s).headOption.getOrElse(StorageLevel.NONE) + .flatMap(_.rddStorageLevel(rddId)).headOption.getOrElse(StorageLevel.NONE) val numCachedPartitions = statuses.map(_.numRddBlocksById(rddId)).sum val memSize = statuses.map(_.memUsedByRdd(rddId)).sum val diskSize = statuses.map(_.diskUsedByRdd(rddId)).sum diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala index e36902ec81e08..a73e1ef0288a5 100644 --- a/core/src/test/scala/org/apache/spark/DriverSuite.scala +++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala @@ -34,7 +34,7 @@ import scala.language.postfixOps class DriverSuite extends FunSuite with Timeouts { test("driver should exit after finishing") { - val sparkHome = sys.props("spark.test.home") + val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) // Regression test for SPARK-530: "Spark driver process doesn't exit after finishing" val masters = Table(("master"), ("local"), ("local-cluster[2,1,512]")) forAll(masters) { (master: String) => diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 8126ef1bb23aa..a5cdcfb5de03b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -295,7 +295,7 @@ class SparkSubmitSuite extends FunSuite with Matchers { // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. 
def runSparkSubmit(args: Seq[String]): String = { - val sparkHome = sys.props("spark.test.home") + val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) Utils.executeAndGetOutput( Seq("./bin/spark-submit") ++ args, new File(sparkHome), diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala index 149a2b3d95b86..39ab53cf0b5b1 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala @@ -27,7 +27,7 @@ import org.apache.spark.SparkConf class ExecutorRunnerTest extends FunSuite { test("command includes appId") { def f(s:String) = new File(s) - val sparkHome = sys.props("spark.test.home") + val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val appDesc = new ApplicationDescription("app name", Some(8), 500, Command("foo", Seq(), Map(), Seq(), Seq(), Seq()), "appUiUrl") val appId = "12345-worker321-9876" diff --git a/dev/audit-release/audit_release.py b/dev/audit-release/audit_release.py index 230e900ecd4de..16ea1a71290dc 100755 --- a/dev/audit-release/audit_release.py +++ b/dev/audit-release/audit_release.py @@ -105,7 +105,7 @@ def get_url(url): "spark-core", "spark-bagel", "spark-mllib", "spark-streaming", "spark-repl", "spark-graphx", "spark-streaming-flume", "spark-streaming-kafka", "spark-streaming-mqtt", "spark-streaming-twitter", "spark-streaming-zeromq", - "spark-catalyst", "spark-sql", "spark-hive" + "spark-catalyst", "spark-sql", "spark-hive", "spark-streaming-kinesis-asl" ] modules = map(lambda m: "%s_%s" % (m, SCALA_BINARY_VERSION), modules) @@ -136,7 +136,7 @@ def ensure_path_not_present(x): os.chdir(original_dir) # SBT application tests -for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive"]: +for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive", "sbt_app_kinesis"]: os.chdir(app) ret = run_cmd("sbt clean run", exit_on_failure=False) test(ret == 0, "sbt application (%s)" % app) diff --git a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala index 77bbd167b199a..fc03fec9866a6 100644 --- a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala +++ b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala @@ -50,5 +50,12 @@ object SimpleApp { println("Ganglia sink was loaded via spark-core") System.exit(-1) } + + // Remove kinesis from default build due to ASL license issue + val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess + if (foundKinesis) { + println("Kinesis was loaded via spark-core") + System.exit(-1) + } } } diff --git a/dev/audit-release/sbt_app_kinesis/build.sbt b/dev/audit-release/sbt_app_kinesis/build.sbt new file mode 100644 index 0000000000000..981bc7957b5ed --- /dev/null +++ b/dev/audit-release/sbt_app_kinesis/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Kinesis Test" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala new file mode 100644 index 0000000000000..9f85066501472 --- /dev/null +++ b/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main.scala + +import scala.util.Try + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ + +object SimpleApp { + def main(args: Array[String]) { + val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess + if (!foundKinesis) { + println("Kinesis not loaded via kinesis-asl") + System.exit(-1) + } + } +} diff --git a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala index 50af90c213b5a..d888de929fdda 100644 --- a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala +++ b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala @@ -38,7 +38,7 @@ object SparkSqlExample { import sqlContext._ val people = sc.makeRDD(1 to 100, 10).map(x => Person(s"Name$x", x)) - people.registerAsTable("people") + people.registerTempTable("people") val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") val teenagerNames = teenagers.map(t => "Name: " + t(0)).collect() teenagerNames.foreach(println) diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index af46572e6602b..42473629d4f15 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -53,15 +53,15 @@ if [[ ! 
"$@" =~ --package-only ]]; then -Dusername=$GIT_USERNAME -Dpassword=$GIT_PASSWORD \ -Dmaven.javadoc.skip=true \ -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \ - -Pyarn -Phive -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl\ -Dtag=$GIT_TAG -DautoVersionSubmodules=true \ + -Pyarn -Phive -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \ --batch-mode release:prepare mvn -DskipTests \ -Darguments="-DskipTests=true -Dmaven.javadoc.skip=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \ -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \ -Dmaven.javadoc.skip=true \ - -Pyarn -Phive -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl\ + -Pyarn -Phive -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \ release:perform cd .. diff --git a/dev/run-tests b/dev/run-tests index 59904ba6b6313..32c3f90fa41c8 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -36,6 +36,9 @@ fi if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" fi + +export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl" + echo "SBT_MAVEN_PROFILES_ARGS=\"$SBT_MAVEN_PROFILES_ARGS\"" # Remove work directory diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 7261badd411a9..0465468084cee 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -142,7 +142,7 @@ case class Person(name: String, age: Int) // Create an RDD of Person objects and register it as a table. val people = sc.textFile("examples/src/main/resources/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)) -people.registerAsTable("people") +people.registerTempTable("people") // SQL statements can be run by using the sql methods provided by sqlContext. val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") @@ -210,7 +210,7 @@ JavaRDD people = sc.textFile("examples/src/main/resources/people.txt").m // Apply a schema to an RDD of JavaBeans and register it as a table. JavaSchemaRDD schemaPeople = sqlContext.applySchema(people, Person.class); -schemaPeople.registerAsTable("people"); +schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. JavaSchemaRDD teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") @@ -248,7 +248,7 @@ people = parts.map(lambda p: {"name": p[0], "age": int(p[1])}) # In future versions of PySpark we would like to add support for registering RDDs with other # datatypes as tables schemaPeople = sqlContext.inferSchema(people) -schemaPeople.registerAsTable("people") +schemaPeople.registerTempTable("people") # SQL can be run over SchemaRDDs that have been registered as a table. teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") @@ -292,7 +292,7 @@ people.saveAsParquetFile("people.parquet") val parquetFile = sqlContext.parquetFile("people.parquet") //Parquet files can also be registered as tables and then used in SQL statements. 
-parquetFile.registerAsTable("parquetFile") +parquetFile.registerTempTable("parquetFile") val teenagers = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") teenagers.map(t => "Name: " + t(0)).collect().foreach(println) {% endhighlight %} @@ -314,7 +314,7 @@ schemaPeople.saveAsParquetFile("people.parquet"); JavaSchemaRDD parquetFile = sqlContext.parquetFile("people.parquet"); //Parquet files can also be registered as tables and then used in SQL statements. -parquetFile.registerAsTable("parquetFile"); +parquetFile.registerTempTable("parquetFile"); JavaSchemaRDD teenagers = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); List teenagerNames = teenagers.map(new Function() { public String call(Row row) { @@ -340,7 +340,7 @@ schemaPeople.saveAsParquetFile("people.parquet") parquetFile = sqlContext.parquetFile("people.parquet") # Parquet files can also be registered as tables and then used in SQL statements. -parquetFile.registerAsTable("parquetFile"); +parquetFile.registerTempTable("parquetFile"); teenagers = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") teenNames = teenagers.map(lambda p: "Name: " + p.name) for teenName in teenNames.collect(): @@ -378,7 +378,7 @@ people.printSchema() // |-- name: StringType // Register this SchemaRDD as a table. -people.registerAsTable("people") +people.registerTempTable("people") // SQL statements can be run by using the sql methods provided by sqlContext. val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") @@ -416,7 +416,7 @@ people.printSchema(); // |-- name: StringType // Register this JavaSchemaRDD as a table. -people.registerAsTable("people"); +people.registerTempTable("people"); // SQL statements can be run by using the sql methods provided by sqlContext. JavaSchemaRDD teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); @@ -455,7 +455,7 @@ people.printSchema() # |-- name: StringType # Register this SchemaRDD as a table. -people.registerAsTable("people") +people.registerTempTable("people") # SQL statements can be run by using the sql methods provided by sqlContext. teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") diff --git a/docs/streaming-custom-receivers.md b/docs/streaming-custom-receivers.md index a2dc3a8961dfc..1e045a3dd0ca9 100644 --- a/docs/streaming-custom-receivers.md +++ b/docs/streaming-custom-receivers.md @@ -4,7 +4,7 @@ title: Spark Streaming Custom Receivers --- Spark Streaming can receive streaming data from any arbitrary data source beyond -the one's for which it has in-built support (that is, beyond Flume, Kafka, files, sockets, etc.). +the one's for which it has in-built support (that is, beyond Flume, Kafka, Kinesis, files, sockets, etc.). This requires the developer to implement a *receiver* that is customized for receiving data from the concerned data source. This guide walks through the process of implementing a custom receiver and using it in a Spark Streaming application. @@ -174,7 +174,7 @@ val words = lines.flatMap(_.split(" ")) ... {% endhighlight %} -The full source code is in the example [CustomReceiver.scala](https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/streaming/examples/CustomReceiver.scala). +The full source code is in the example [CustomReceiver.scala](https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala).
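The registerAsTable → registerTempTable rename in the docs above is mechanical; here is a minimal before/after sketch, assuming the sqlContext, sc, Person case class, and `import sqlContext._` implicits already defined in the SQL guide:

```scala
// Sketch of the rename only; the rest of the call site is unchanged.
val people = sc.textFile("examples/src/main/resources/people.txt")
  .map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt))
// Before: people.registerAsTable("people")
people.registerTempTable("people")
```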
diff --git a/docs/streaming-kinesis.md b/docs/streaming-kinesis.md new file mode 100644 index 0000000000000..801c905c88df8 --- /dev/null +++ b/docs/streaming-kinesis.md @@ -0,0 +1,58 @@ +--- +layout: global +title: Spark Streaming Kinesis Receiver +--- + +### Kinesis +Build notes: +
  • Spark supports a Kinesis Streaming Receiver, which is not included in the default build due to licensing restrictions.
  • _**Note that by embedding this library you will include [ASL](https://aws.amazon.com/asl/)-licensed code in your Spark package**_.
  • The Spark Kinesis Streaming Receiver source code, examples, tests, and artifacts live in `$SPARK_HOME/extras/kinesis-asl`.
  • To build with Kinesis, you must run the maven or sbt builds with `-Pkinesis-asl`.
  • Applications will need to link to the `spark-streaming-kinesis-asl` artifact, as shown in the sketch below.
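As a rough illustration, an application's sbt build definition might declare the dependency like the following (the version string is a placeholder; match it to the Spark version you built with `-Pkinesis-asl`):

```scala
// Hypothetical sbt snippet linking against the ASL-licensed Kinesis artifact.
// "1.1.0" is a placeholder version; use the Spark version you actually built.
libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % "1.1.0"
```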

Kinesis examples notes:
  • To build the Kinesis examples, you must run the maven or sbt builds with `-Pkinesis-asl`.
  • These examples automatically determine the number of local threads and KinesisReceivers to spin up based on the number of shards for the stream; see the sketch after this list.
  • KinesisWordCountProducerASL will generate random data to put onto the Kinesis stream for testing.
  • Checkpointing is disabled (no checkpoint dir is set). The examples as written will not recover from a driver failure.
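A minimal sketch of that sizing logic, following the pattern in the KinesisWordCountASL example later in this patch (the stream name and endpoint are illustrative):

```scala
// Sketch: derive the receiver count from the stream's shard count, then
// reserve one extra local thread for processing, as the examples do.
import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
import com.amazonaws.services.kinesis.AmazonKinesisClient

val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain())
kinesisClient.setEndpoint("https://kinesis.us-east-1.amazonaws.com")
val numShards = kinesisClient.describeStream("mySparkStream")
  .getStreamDescription().getShards().size()
val numStreams = numShards            // one receiver/DStream per shard
val numSparkThreads = numStreams + 1  // one extra thread for processing the data
```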

Deployment and runtime notes:
  • A single KinesisReceiver can process many shards of a stream.
  • Each shard of a stream is processed by one or more KinesisReceivers managed by the Kinesis Client Library (KCL) Worker.
  • You never need more KinesisReceivers than the number of shards in your stream.
  • You can horizontally scale the receiving by creating more KinesisReceiver/DStreams, up to the number of shards for a given stream; see the sketch after this list.
  • The Kinesis libraries must be present on all worker nodes, as they will need access to the Kinesis Client Library.
  • This code uses the DefaultAWSCredentialsProviderChain and searches for credentials in the following order of precedence:
    1) Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
    2) Java System Properties - aws.accessKeyId and aws.secretKey
    3) Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
    4) Instance profile credentials - delivered through the Amazon EC2 metadata service
  • You need to set up a Kinesis stream with one or more shards, per the instructions at http://docs.aws.amazon.com/kinesis/latest/dev/step-one-create-stream.html.
  • Valid Kinesis endpoint URLs can be found here: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region
  • When you first start up the KinesisReceiver, the Kinesis Client Library (KCL) needs ~30s to establish connectivity with the AWS Kinesis service, retrieve any checkpoint data, and negotiate with other KCL workers reading from the same stream.
  • Be careful when changing the app name. Kinesis maintains a mapping table in DynamoDB based on this app name (http://docs.aws.amazon.com/kinesis/latest/dev/kinesis-record-processor-implementation-app.html#kinesis-record-processor-initialization). Changing the app name could lead to Kinesis errors, as only one logical application can process a given stream. To start fresh, it is best to delete the DynamoDB table that matches your app name. This DynamoDB table lives in us-east-1 regardless of the Kinesis endpoint URL.
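To make the scaling note concrete, here is the pattern the bundled Scala example uses to create one receiver per shard and union the results; it assumes an existing StreamingContext `ssc` and the `numStreams` computed in the earlier sketch (stream name, endpoint, and checkpoint interval are illustrative):

```scala
// Sketch: one Kinesis DStream/receiver per shard, unioned into a single DStream.
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Milliseconds
import org.apache.spark.streaming.kinesis.KinesisUtils
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream

val kinesisCheckpointInterval = Milliseconds(2000)  // same as the batch interval here
val kinesisStreams = (0 until numStreams).map { i =>
  KinesisUtils.createStream(ssc, "mySparkStream", "https://kinesis.us-east-1.amazonaws.com",
    kinesisCheckpointInterval, InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2)
}
val unionStreams = ssc.union(kinesisStreams)
```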

Failure recovery notes:
  • The combination of Spark Streaming and Kinesis creates 3 different checkpoints, as follows (see the sketch after this list):
    1) RDD data checkpoint (Spark Streaming) - frequency is configurable with DStream.checkpoint(Duration)
    2) RDD metadata checkpoint (Spark Streaming) - frequency is every DStream batch
    3) Kinesis checkpointing (Kinesis) - frequency is controlled by the developer calling ICheckpointer.checkpoint() directly
  • Checkpointing too frequently will cause excess load on the AWS checkpoint storage layer and may lead to AWS throttling.
  • Upon startup, a KinesisReceiver will begin processing records with sequence numbers greater than the last checkpoint sequence number recorded per shard.
  • If no checkpoint info exists, the worker will start either from the oldest record available (InitialPositionInStream.TRIM_HORIZON) or from the tip/latest (InitialPositionInStream.LATEST). This is configurable.
  • When pulling from the stream tip (InitialPositionInStream.LATEST), only new stream data will be picked up after the KinesisReceiver starts.
  • InitialPositionInStream.LATEST could lead to missed records if data is added to the stream while no KinesisReceivers are running.
  • In production, you'll want to switch to InitialPositionInStream.TRIM_HORIZON, which will read up to 24 hours (the Kinesis limit) of previous stream data, depending on the checkpoint frequency.
  • InitialPositionInStream.TRIM_HORIZON may lead to duplicate processing of records, depending on the checkpoint frequency.
  • Record processing should be idempotent when possible.
  • Failed or latent KinesisReceivers will be detected and automatically shut down/load-balanced by the KCL.
  • If possible, explicitly shut down the worker if a failure occurs in order to trigger the final checkpoint.
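For reference, the two Spark Streaming checkpoints above correspond to API calls along these lines (the directory and interval are illustrative; the Kinesis checkpoint interval is set when the stream is created, as in the earlier sketch):

```scala
// Sketch: Spark Streaming's two checkpoint knobs, per the notes above.
// Assumes the ssc and unionStreams values from the earlier sketches.
ssc.checkpoint("hdfs:///tmp/spark-streaming-checkpoints")  // RDD metadata checkpoint directory
unionStreams.checkpoint(Milliseconds(10000))               // RDD data checkpoint interval
```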
  • diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 7b8b7933434c4..9f331ed50d2a4 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -9,7 +9,7 @@ title: Spark Streaming Programming Guide # Overview Spark Streaming is an extension of the core Spark API that allows enables high-throughput, fault-tolerant stream processing of live data streams. Data can be ingested from many sources -like Kafka, Flume, Twitter, ZeroMQ or plain old TCP sockets and be processed using complex +like Kafka, Flume, Twitter, ZeroMQ, Kinesis or plain old TCP sockets and be processed using complex algorithms expressed with high-level functions like `map`, `reduce`, `join` and `window`. Finally, processed data can be pushed out to filesystems, databases, and live dashboards. In fact, you can apply Spark's in-built @@ -38,7 +38,7 @@ stream of results in batches. Spark Streaming provides a high-level abstraction called *discretized stream* or *DStream*, which represents a continuous stream of data. DStreams can be created either from input data -stream from sources such as Kafka and Flume, or by applying high-level +stream from sources such as Kafka, Flume, and Kinesis, or by applying high-level operations on other DStreams. Internally, a DStream is represented as a sequence of [RDDs](api/scala/index.html#org.apache.spark.rdd.RDD). @@ -313,7 +313,7 @@ To write your own Spark Streaming program, you will have to add the following de artifactId = spark-streaming_{{site.SCALA_BINARY_VERSION}} version = {{site.SPARK_VERSION}} -For ingesting data from sources like Kafka and Flume that are not present in the Spark +For ingesting data from sources like Kafka, Flume, and Kinesis that are not present in the Spark Streaming core API, you will have to add the corresponding artifact `spark-streaming-xyz_{{site.SCALA_BINARY_VERSION}}` to the dependencies. For example, @@ -327,6 +327,7 @@ some of the common ones are as follows. Twitter spark-streaming-twitter_{{site.SCALA_BINARY_VERSION}} ZeroMQ spark-streaming-zeromq_{{site.SCALA_BINARY_VERSION}} MQTT spark-streaming-mqtt_{{site.SCALA_BINARY_VERSION}} + Kinesis
    (built separately) kinesis-asl_{{site.SCALA_BINARY_VERSION}} @@ -442,7 +443,7 @@ see the API documentations of the relevant functions in Scala and [JavaStreamingContext](api/scala/index.html#org.apache.spark.streaming.api.java.JavaStreamingContext) for Java. -Additional functionality for creating DStreams from sources such as Kafka, Flume, and Twitter +Additional functionality for creating DStreams from sources such as Kafka, Flume, Kinesis, and Twitter can be imported by adding the right dependencies as explained in an [earlier](#linking) section. To take the case of Kafka, after adding the artifact `spark-streaming-kafka_{{site.SCALA_BINARY_VERSION}}` to the @@ -467,6 +468,9 @@ For more details on these additional sources, see the corresponding [API documen Furthermore, you can also implement your own custom receiver for your sources. See the [Custom Receiver Guide](streaming-custom-receivers.html). +### Kinesis +[Kinesis](streaming-kinesis.html) + ## Operations There are two kinds of DStream operations - _transformations_ and _output operations_. Similar to RDD transformations, DStream transformations operate on one or more DStreams to create new DStreams diff --git a/examples/pom.xml b/examples/pom.xml index c4ed0f5a6a02b..8c4c128bb484d 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -34,6 +34,19 @@ Spark Project Examples http://spark.apache.org/ + + + kinesis-asl + + + org.apache.spark + spark-streaming-kinesis-asl_${scala.binary.version} + ${project.version} + + + + + org.apache.spark diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java index 607df3eddd550..898297dc658ba 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java @@ -74,7 +74,7 @@ public Person call(String line) throws Exception { // Apply a schema to an RDD of Java Beans and register it as a table. JavaSchemaRDD schemaPeople = sqlCtx.applySchema(people, Person.class); - schemaPeople.registerAsTable("people"); + schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. JavaSchemaRDD teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); @@ -100,7 +100,7 @@ public String call(Row row) { JavaSchemaRDD parquetFile = sqlCtx.parquetFile("people.parquet"); //Parquet files can also be registered as tables and then used in SQL statements. - parquetFile.registerAsTable("parquetFile"); + parquetFile.registerTempTable("parquetFile"); JavaSchemaRDD teenagers2 = sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); teenagerNames = teenagers2.map(new Function() { @@ -128,7 +128,7 @@ public String call(Row row) { // |-- name: StringType // Register this JavaSchemaRDD as a table. - peopleFromJsonFile.registerAsTable("people"); + peopleFromJsonFile.registerTempTable("people"); // SQL statements can be run by using the sql methods provided by sqlCtx. 
JavaSchemaRDD teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); @@ -158,7 +158,7 @@ public String call(Row row) { // | |-- state: StringType // |-- name: StringType - peopleFromJsonRDD.registerAsTable("people2"); + peopleFromJsonRDD.registerTempTable("people2"); JavaSchemaRDD peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2"); List nameAndCity = peopleWithCity.map(new Function() { diff --git a/examples/src/main/python/mllib/decision_tree_runner.py b/examples/src/main/python/mllib/decision_tree_runner.py new file mode 100755 index 0000000000000..8efadb5223f56 --- /dev/null +++ b/examples/src/main/python/mllib/decision_tree_runner.py @@ -0,0 +1,133 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Decision tree classification and regression using MLlib. +""" + +import numpy, os, sys + +from operator import add + +from pyspark import SparkContext +from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.tree import DecisionTree +from pyspark.mllib.util import MLUtils + + +def getAccuracy(dtModel, data): + """ + Return accuracy of DecisionTreeModel on the given RDD[LabeledPoint]. + """ + seqOp = (lambda acc, x: acc + (x[0] == x[1])) + predictions = dtModel.predict(data.map(lambda x: x.features)) + truth = data.map(lambda p: p.label) + trainCorrect = predictions.zip(truth).aggregate(0, seqOp, add) + if data.count() == 0: + return 0 + return trainCorrect / (0.0 + data.count()) + + +def getMSE(dtModel, data): + """ + Return mean squared error (MSE) of DecisionTreeModel on the given + RDD[LabeledPoint]. + """ + seqOp = (lambda acc, x: acc + numpy.square(x[0] - x[1])) + predictions = dtModel.predict(data.map(lambda x: x.features)) + truth = data.map(lambda p: p.label) + trainMSE = predictions.zip(truth).aggregate(0, seqOp, add) + if data.count() == 0: + return 0 + return trainMSE / (0.0 + data.count()) + + +def reindexClassLabels(data): + """ + Re-index class labels in a dataset to the range {0,...,numClasses-1}. + If all labels in that range already appear at least once, + then the returned RDD is the same one (without a mapping). + Note: If a label simply does not appear in the data, + the index will not include it. + Be aware of this when reindexing subsampled data. + :param data: RDD of LabeledPoint where labels are integer values + denoting labels for a classification problem. + :return: Pair (reindexedData, origToNewLabels) where + reindexedData is an RDD of LabeledPoint with labels in + the range {0,...,numClasses-1}, and + origToNewLabels is a dictionary mapping original labels + to new labels. 
+ """ + # classCounts: class --> # examples in class + classCounts = data.map(lambda x: x.label).countByValue() + numExamples = sum(classCounts.values()) + sortedClasses = sorted(classCounts.keys()) + numClasses = len(classCounts) + # origToNewLabels: class --> index in 0,...,numClasses-1 + if (numClasses < 2): + print >> sys.stderr, \ + "Dataset for classification should have at least 2 classes." + \ + " The given dataset had only %d classes." % numClasses + exit(1) + origToNewLabels = dict([(sortedClasses[i], i) for i in range(0, numClasses)]) + + print "numClasses = %d" % numClasses + print "Per-class example fractions, counts:" + print "Class\tFrac\tCount" + for c in sortedClasses: + frac = classCounts[c] / (numExamples + 0.0) + print "%g\t%g\t%d" % (c, frac, classCounts[c]) + + if (sortedClasses[0] == 0 and sortedClasses[-1] == numClasses - 1): + return (data, origToNewLabels) + else: + reindexedData = \ + data.map(lambda x: LabeledPoint(origToNewLabels[x.label], x.features)) + return (reindexedData, origToNewLabels) + + +def usage(): + print >> sys.stderr, \ + "Usage: decision_tree_runner [libsvm format data filepath]\n" + \ + " Note: This only supports binary classification." + exit(1) + + +if __name__ == "__main__": + if len(sys.argv) > 2: + usage() + sc = SparkContext(appName="PythonDT") + + # Load data. + dataPath = 'data/mllib/sample_libsvm_data.txt' + if len(sys.argv) == 2: + dataPath = sys.argv[1] + if not os.path.isfile(dataPath): + usage() + points = MLUtils.loadLibSVMFile(sc, dataPath) + + # Re-index class labels if needed. + (reindexedData, origToNewLabels) = reindexClassLabels(points) + + # Train a classifier. + model = DecisionTree.trainClassifier(reindexedData, numClasses=2) + # Print learned tree and stats. + print "Trained DecisionTree for classification:" + print " Model numNodes: %d\n" % model.numNodes() + print " Model depth: %d\n" % model.depth() + print " Training accuracy: %g\n" % getAccuracy(model, reindexedData) + print model diff --git a/examples/src/main/python/mllib/logistic_regression.py b/examples/src/main/python/mllib/logistic_regression.py index 6e0f7a4ee5a81..9d547ff77c984 100755 --- a/examples/src/main/python/mllib/logistic_regression.py +++ b/examples/src/main/python/mllib/logistic_regression.py @@ -30,8 +30,10 @@ from pyspark.mllib.classification import LogisticRegressionWithSGD -# Parse a line of text into an MLlib LabeledPoint object def parsePoint(line): + """ + Parse a line of text into an MLlib LabeledPoint object. + """ values = [float(s) for s in line.split(' ')] if values[0] == -1: # Convert -1 labels to 0 for MLlib values[0] = 0 diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala index 63db688bfb8c0..d56d64c564200 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala @@ -36,7 +36,7 @@ object RDDRelation { val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))) // Any RDD containing case classes can be registered as a table. The schema of the table is // automatically inferred using scala reflection. - rdd.registerAsTable("records") + rdd.registerTempTable("records") // Once tables have been registered, you can run SQL queries over them. 
println("Result of SELECT *:") @@ -66,7 +66,7 @@ object RDDRelation { parquetFile.where('key === 1).select('value as 'a).collect().foreach(println) // These files can also be registered as tables. - parquetFile.registerAsTable("parquetFile") + parquetFile.registerTempTable("parquetFile") sql("SELECT * FROM parquetFile").collect().foreach(println) } } diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala index dc5290fb4f10e..12530c8490b09 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala @@ -56,7 +56,7 @@ object HiveFromSpark { // You can also register RDDs as temporary tables within a HiveContext. val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))) - rdd.registerAsTable("records") + rdd.registerTempTable("records") // Queries can then join RDD data with data stored in Hive. println("Result of SELECT *:") diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml new file mode 100644 index 0000000000000..a54b34235dfb4 --- /dev/null +++ b/extras/kinesis-asl/pom.xml @@ -0,0 +1,96 @@ + + + + 4.0.0 + + org.apache.spark + spark-parent + 1.1.0-SNAPSHOT + ../../pom.xml + + + + org.apache.spark + spark-streaming-kinesis-asl_2.10 + jar + Spark Kinesis Integration + + + kinesis-asl + + + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + test-jar + test + + + com.amazonaws + amazon-kinesis-client + ${aws.kinesis.client.version} + + + com.amazonaws + aws-java-sdk + ${aws.java.sdk.version} + + + org.scalatest + scalatest_${scala.binary.version} + test + + + org.mockito + mockito-all + test + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.easymock + easymockclassextension + test + + + com.novocode + junit-interface + test + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + org.scalatest + scalatest-maven-plugin + + + + diff --git a/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java new file mode 100644 index 0000000000000..a8b907b241893 --- /dev/null +++ b/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.examples.streaming; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.Duration; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.api.java.JavaPairDStream; +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import org.apache.spark.streaming.kinesis.KinesisUtils; + +import scala.Tuple2; + +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; +import com.amazonaws.services.kinesis.AmazonKinesisClient; +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; +import com.google.common.collect.Lists; + +/** + * Java-friendly Kinesis Spark Streaming WordCount example + * + * See http://spark.apache.org/docs/latest/streaming-kinesis.html for more details + * on the Kinesis Spark Streaming integration. + * + * This example spins up 1 Kinesis Worker (Spark Streaming Receiver) per shard + * for the given stream. + * It then starts pulling from the last checkpointed sequence number of the given + * and . + * + * Valid endpoint urls: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region + * + * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials + * in the following order of precedence: + * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY + * Java System Properties - aws.accessKeyId and aws.secretKey + * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs + * Instance profile credentials - delivered through the Amazon EC2 metadata service + * + * Usage: JavaKinesisWordCountASL + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service + * (ie. https://kinesis.us-east-1.amazonaws.com) + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ $SPARK_HOME/bin/run-example \ + * org.apache.spark.examples.streaming.JavaKinesisWordCountASL mySparkStream \ + * https://kinesis.us-east-1.amazonaws.com + * + * There is a companion helper class called KinesisWordCountProducerASL which puts dummy data + * onto the Kinesis stream. + * Usage instructions for KinesisWordCountProducerASL are provided in the class definition. + */ +public final class JavaKinesisWordCountASL { + private static final Pattern WORD_SEPARATOR = Pattern.compile(" "); + private static final Logger logger = Logger.getLogger(JavaKinesisWordCountASL.class); + + /* Make the constructor private to enforce singleton */ + private JavaKinesisWordCountASL() { + } + + public static void main(String[] args) { + /* Check that all required args were passed in. */ + if (args.length < 2) { + System.err.println( + "|Usage: KinesisWordCount \n" + + "| is the name of the Kinesis stream\n" + + "| is the endpoint of the Kinesis service\n" + + "| (e.g. 
https://kinesis.us-east-1.amazonaws.com)\n"); + System.exit(1); + } + + StreamingExamples.setStreamingLogLevels(); + + /* Populate the appropriate variables from the given args */ + String streamName = args[0]; + String endpointUrl = args[1]; + /* Set the batch interval to a fixed 2000 millis (2 seconds) */ + Duration batchInterval = new Duration(2000); + + /* Create a Kinesis client in order to determine the number of shards for the given stream */ + AmazonKinesisClient kinesisClient = new AmazonKinesisClient( + new DefaultAWSCredentialsProviderChain()); + kinesisClient.setEndpoint(endpointUrl); + + /* Determine the number of shards from the stream */ + int numShards = kinesisClient.describeStream(streamName) + .getStreamDescription().getShards().size(); + + /* In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard */ + int numStreams = numShards; + + /* Must add 1 more thread than the number of receivers or the output won't show properly from the driver */ + int numSparkThreads = numStreams + 1; + + /* Setup the Spark config. */ + SparkConf sparkConfig = new SparkConf().setAppName("KinesisWordCount").setMaster( + "local[" + numSparkThreads + "]"); + + /* Kinesis checkpoint interval. Same as batchInterval for this example. */ + Duration checkpointInterval = batchInterval; + + /* Setup the StreamingContext */ + JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval); + + /* Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */ + List> streamsList = new ArrayList>(numStreams); + for (int i = 0; i < numStreams; i++) { + streamsList.add( + KinesisUtils.createStream(jssc, streamName, endpointUrl, checkpointInterval, + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()) + ); + } + + /* Union all the streams if there is more than 1 stream */ + JavaDStream unionStreams; + if (streamsList.size() > 1) { + unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size())); + } else { + /* Otherwise, just use the 1 stream */ + unionStreams = streamsList.get(0); + } + + /* + * Split each line of the union'd DStreams into multiple words using flatMap to produce the collection. + * Convert lines of byte[] to multiple Strings by first converting to String, then splitting on WORD_SEPARATOR. + */ + JavaDStream words = unionStreams.flatMap(new FlatMapFunction() { + @Override + public Iterable call(byte[] line) { + return Lists.newArrayList(WORD_SEPARATOR.split(new String(line))); + } + }); + + /* Map each word to a (word, 1) tuple, then reduce/aggregate by word. */ + JavaPairDStream wordCounts = words.mapToPair( + new PairFunction() { + @Override + public Tuple2 call(String s) { + return new Tuple2(s, 1); + } + }).reduceByKey(new Function2() { + @Override + public Integer call(Integer i1, Integer i2) { + return i1 + i2; + } + }); + + /* Print the first 10 wordCounts */ + wordCounts.print(); + + /* Start the streaming context and await termination */ + jssc.start(); + jssc.awaitTermination(); + } +} diff --git a/extras/kinesis-asl/src/main/resources/log4j.properties b/extras/kinesis-asl/src/main/resources/log4j.properties new file mode 100644 index 0000000000000..97348fb5b6123 --- /dev/null +++ b/extras/kinesis-asl/src/main/resources/log4j.properties @@ -0,0 +1,37 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +log4j.rootCategory=WARN, console + +# File appender +log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file.append=false +log4j.appender.file.file=target/unit-tests.log +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n + +# Console appender +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.eclipse.jetty=WARN +log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO \ No newline at end of file diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala new file mode 100644 index 0000000000000..d03edf8b30a9f --- /dev/null +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.streaming + +import java.nio.ByteBuffer +import scala.util.Random +import org.apache.spark.Logging +import org.apache.spark.SparkConf +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.Milliseconds +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions +import org.apache.spark.streaming.kinesis.KinesisUtils +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain +import com.amazonaws.services.kinesis.AmazonKinesisClient +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import com.amazonaws.services.kinesis.model.PutRecordRequest +import org.apache.log4j.Logger +import org.apache.log4j.Level + +/** + * Kinesis Spark Streaming WordCount example. + * + * See http://spark.apache.org/docs/latest/streaming-kinesis.html for more details on + * the Kinesis Spark Streaming integration. + * + * This example spins up 1 Kinesis Worker (Spark Streaming Receiver) per shard + * for the given stream. + * It then starts pulling from the last checkpointed sequence number of the given + * and . + * + * Valid endpoint urls: http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region + * + * This code uses the DefaultAWSCredentialsProviderChain and searches for credentials + * in the following order of precedence: + * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY + * Java System Properties - aws.accessKeyId and aws.secretKey + * Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs + * Instance profile credentials - delivered through the Amazon EC2 metadata service + * + * Usage: KinesisWordCountASL + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service + * (ie. https://kinesis.us-east-1.amazonaws.com) + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ $SPARK_HOME/bin/run-example \ + * org.apache.spark.examples.streaming.KinesisWordCountASL mySparkStream \ + * https://kinesis.us-east-1.amazonaws.com + * + * There is a companion helper class below called KinesisWordCountProducerASL which puts + * dummy data onto the Kinesis stream. + * Usage instructions for KinesisWordCountProducerASL are provided in that class definition. + */ +object KinesisWordCountASL extends Logging { + def main(args: Array[String]) { + /* Check that all required args were passed in. */ + if (args.length < 2) { + System.err.println( + """ + |Usage: KinesisWordCount + | is the name of the Kinesis stream + | is the endpoint of the Kinesis service + | (e.g. https://kinesis.us-east-1.amazonaws.com) + """.stripMargin) + System.exit(1) + } + + StreamingExamples.setStreamingLogLevels() + + /* Populate the appropriate variables from the given args */ + val Array(streamName, endpointUrl) = args + + /* Determine the number of shards from the stream */ + val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) + kinesisClient.setEndpoint(endpointUrl) + val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards() + .size() + + /* In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard. */ + val numStreams = numShards + + /* + * numSparkThreads should be 1 more thread than the number of receivers. + * This leaves one thread available for actually processing the data. 
+ */ + val numSparkThreads = numStreams + 1 + + /* Setup the and SparkConfig and StreamingContext */ + /* Spark Streaming batch interval */ + val batchInterval = Milliseconds(2000) + val sparkConfig = new SparkConf().setAppName("KinesisWordCount") + .setMaster(s"local[$numSparkThreads]") + val ssc = new StreamingContext(sparkConfig, batchInterval) + + /* Kinesis checkpoint interval. Same as batchInterval for this example. */ + val kinesisCheckpointInterval = batchInterval + + /* Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */ + val kinesisStreams = (0 until numStreams).map { i => + KinesisUtils.createStream(ssc, streamName, endpointUrl, kinesisCheckpointInterval, + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2) + } + + /* Union all the streams */ + val unionStreams = ssc.union(kinesisStreams) + + /* Convert each line of Array[Byte] to String, split into words, and count them */ + val words = unionStreams.flatMap(byteArray => new String(byteArray) + .split(" ")) + + /* Map each word to a (word, 1) tuple so we can reduce/aggregate by key. */ + val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _) + + /* Print the first 10 wordCounts */ + wordCounts.print() + + /* Start the streaming context and await termination */ + ssc.start() + ssc.awaitTermination() + } +} + +/** + * Usage: KinesisWordCountProducerASL + * + * is the name of the Kinesis stream (ie. mySparkStream) + * is the endpoint of the Kinesis service + * (ie. https://kinesis.us-east-1.amazonaws.com) + * is the rate of records per second to put onto the stream + * is the rate of records per second to put onto the stream + * + * Example: + * $ export AWS_ACCESS_KEY_ID= + * $ export AWS_SECRET_KEY= + * $ $SPARK_HOME/bin/run-example \ + * org.apache.spark.examples.streaming.KinesisWordCountProducerASL mySparkStream \ + * https://kinesis.us-east-1.amazonaws.com 10 5 + */ +object KinesisWordCountProducerASL { + def main(args: Array[String]) { + if (args.length < 4) { + System.err.println("Usage: KinesisWordCountProducerASL " + + " ") + System.exit(1) + } + + StreamingExamples.setStreamingLogLevels() + + /* Populate the appropriate variables from the given args */ + val Array(stream, endpoint, recordsPerSecond, wordsPerRecord) = args + + /* Generate the records and return the totals */ + val totals = generate(stream, endpoint, recordsPerSecond.toInt, wordsPerRecord.toInt) + + /* Print the array of (index, total) tuples */ + println("Totals") + totals.foreach(total => println(total.toString())) + } + + def generate(stream: String, + endpoint: String, + recordsPerSecond: Int, + wordsPerRecord: Int): Seq[(Int, Int)] = { + + val MaxRandomInts = 10 + + /* Create the Kinesis client */ + val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain()) + kinesisClient.setEndpoint(endpoint) + + println(s"Putting records onto stream $stream and endpoint $endpoint at a rate of" + + s" $recordsPerSecond records per second and $wordsPerRecord words per record"); + + val totals = new Array[Int](MaxRandomInts) + /* Put String records onto the stream per the given recordPerSec and wordsPerRecord */ + for (i <- 1 to 5) { + + /* Generate recordsPerSec records to put onto the stream */ + val records = (1 to recordsPerSecond.toInt).map { recordNum => + /* + * Randomly generate each wordsPerRec words between 0 (inclusive) + * and MAX_RANDOM_INTS (exclusive) + */ + val data = (1 to wordsPerRecord.toInt).map(x => { + /* Generate the random int */ + val randomInt = 
Random.nextInt(MaxRandomInts) + + /* Keep track of the totals */ + totals(randomInt) += 1 + + randomInt.toString() + }).mkString(" ") + + /* Create a partitionKey based on recordNum */ + val partitionKey = s"partitionKey-$recordNum" + + /* Create a PutRecordRequest with an Array[Byte] version of the data */ + val putRecordRequest = new PutRecordRequest().withStreamName(stream) + .withPartitionKey(partitionKey) + .withData(ByteBuffer.wrap(data.getBytes())); + + /* Put the record onto the stream and capture the PutRecordResult */ + val putRecordResult = kinesisClient.putRecord(putRecordRequest); + } + + /* Sleep for a second */ + Thread.sleep(1000) + println("Sent " + recordsPerSecond + " records") + } + + /* Convert the totals to (index, total) tuple */ + (0 to (MaxRandomInts - 1)).zip(totals) + } +} + +/** + * Utility functions for Spark Streaming examples. + * This has been lifted from the examples/ project to remove the circular dependency. + */ +object StreamingExamples extends Logging { + + /** Set reasonable logging levels for streaming if the user has not configured log4j. */ + def setStreamingLogLevels() { + val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements + if (!log4jInitialized) { + // We first log something to initialize Spark's default logging, then we override the + // logging level. + logInfo("Setting log level to [WARN] for streaming example." + + " To override add a custom log4j.properties to the classpath.") + Logger.getRootLogger.setLevel(Level.WARN) + } + } +} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala new file mode 100644 index 0000000000000..0b80b611cdce7 --- /dev/null +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import org.apache.spark.Logging +import org.apache.spark.streaming.Duration +import org.apache.spark.streaming.util.Clock +import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.streaming.util.SystemClock + +/** + * This is a helper class for managing checkpoint clocks. + * + * @param checkpointInterval + * @param currentClock. 
Default to current SystemClock if none is passed in (mocking purposes) + */ +private[kinesis] class KinesisCheckpointState( + checkpointInterval: Duration, + currentClock: Clock = new SystemClock()) + extends Logging { + + /* Initialize the checkpoint clock using the given currentClock + checkpointInterval millis */ + val checkpointClock = new ManualClock() + checkpointClock.setTime(currentClock.currentTime() + checkpointInterval.milliseconds) + + /** + * Check if it's time to checkpoint based on the current time and the derived time + * for the next checkpoint + * + * @return true if it's time to checkpoint + */ + def shouldCheckpoint(): Boolean = { + new SystemClock().currentTime() > checkpointClock.currentTime() + } + + /** + * Advance the checkpoint clock by the checkpoint interval. + */ + def advanceCheckpoint() = { + checkpointClock.addToTime(checkpointInterval.milliseconds) + } +} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala new file mode 100644 index 0000000000000..1bd1f324298e7 --- /dev/null +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import java.net.InetAddress +import java.util.UUID + +import org.apache.spark.Logging +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.Duration +import org.apache.spark.streaming.receiver.Receiver + +import com.amazonaws.auth.AWSCredentialsProvider +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorFactory +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.KinesisClientLibConfiguration +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker + +/** + * Custom AWS Kinesis-specific implementation of Spark Streaming's Receiver. + * This implementation relies on the Kinesis Client Library (KCL) Worker as described here: + * https://github.com/awslabs/amazon-kinesis-client + * This is a custom receiver used with StreamingContext.receiverStream(Receiver) + * as described here: + * http://spark.apache.org/docs/latest/streaming-custom-receivers.html + * Instances of this class will get shipped to the Spark Streaming Workers + * to run within a Spark Executor. + * + * @param appName Kinesis application name. Kinesis Apps are mapped to Kinesis Streams + * by the Kinesis Client Library. 
If you change the App name or Stream name, + * the KCL will throw errors. This usually requires deleting the backing + * DynamoDB table with the same name this Kinesis application. + * @param streamName Kinesis stream name + * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) + * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. + * See the Kinesis Spark Streaming documentation for more + * details on the different types of checkpoints. + * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the + * worker's initial starting position in the stream. + * The values are either the beginning of the stream + * per Kinesis' limit of 24 hours + * (InitialPositionInStream.TRIM_HORIZON) or + * the tip of the stream (InitialPositionInStream.LATEST). + * @param storageLevel Storage level to use for storing the received objects + * + * @return ReceiverInputDStream[Array[Byte]] + */ +private[kinesis] class KinesisReceiver( + appName: String, + streamName: String, + endpointUrl: String, + checkpointInterval: Duration, + initialPositionInStream: InitialPositionInStream, + storageLevel: StorageLevel) + extends Receiver[Array[Byte]](storageLevel) with Logging { receiver => + + /* + * The following vars are built in the onStart() method which executes in the Spark Worker after + * this code is serialized and shipped remotely. + */ + + /* + * workerId should be based on the ip address of the actual Spark Worker where this code runs + * (not the Driver's ip address.) + */ + var workerId: String = null + + /* + * This impl uses the DefaultAWSCredentialsProviderChain and searches for credentials + * in the following order of precedence: + * Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY + * Java System Properties - aws.accessKeyId and aws.secretKey + * Credential profiles file at the default location (~/.aws/credentials) shared by all + * AWS SDKs and the AWS CLI + * Instance profile credentials delivered through the Amazon EC2 metadata service + */ + var credentialsProvider: AWSCredentialsProvider = null + + /* KCL config instance. */ + var kinesisClientLibConfiguration: KinesisClientLibConfiguration = null + + /* + * RecordProcessorFactory creates impls of IRecordProcessor. + * IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the + * IRecordProcessor.processRecords() method. + * We're using our custom KinesisRecordProcessor in this case. + */ + var recordProcessorFactory: IRecordProcessorFactory = null + + /* + * Create a Kinesis Worker. + * This is the core client abstraction from the Kinesis Client Library (KCL). + * We pass the RecordProcessorFactory from above as well as the KCL config instance. + * A Kinesis Worker can process 1..* shards from the given stream - each with its + * own RecordProcessor. + */ + var worker: Worker = null + + /** + * This is called when the KinesisReceiver starts and must be non-blocking. + * The KCL creates and manages the receiving/processing thread pool through the Worker.run() + * method. 
+ */ + override def onStart() { + workerId = InetAddress.getLocalHost.getHostAddress() + ":" + UUID.randomUUID() + credentialsProvider = new DefaultAWSCredentialsProviderChain() + kinesisClientLibConfiguration = new KinesisClientLibConfiguration(appName, streamName, + credentialsProvider, workerId).withKinesisEndpoint(endpointUrl) + .withInitialPositionInStream(initialPositionInStream).withTaskBackoffTimeMillis(500) + recordProcessorFactory = new IRecordProcessorFactory { + override def createProcessor: IRecordProcessor = new KinesisRecordProcessor(receiver, + workerId, new KinesisCheckpointState(checkpointInterval)) + } + worker = new Worker(recordProcessorFactory, kinesisClientLibConfiguration) + worker.run() + logInfo(s"Started receiver with workerId $workerId") + } + + /** + * This is called when the KinesisReceiver stops. + * The KCL worker.shutdown() method stops the receiving/processing threads. + * The KCL will do its best to drain and checkpoint any in-flight records upon shutdown. + */ + override def onStop() { + worker.shutdown() + logInfo(s"Shut down receiver with workerId $workerId") + workerId = null + credentialsProvider = null + kinesisClientLibConfiguration = null + recordProcessorFactory = null + worker = null + } +} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala new file mode 100644 index 0000000000000..8ecc2d90160b1 --- /dev/null +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import java.util.List + +import scala.collection.JavaConversions.asScalaBuffer +import scala.util.Random + +import org.apache.spark.Logging + +import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer +import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason +import com.amazonaws.services.kinesis.model.Record + +/** + * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor. + * This implementation operates on the Array[Byte] from the KinesisReceiver. 
+ * The Kinesis Worker creates an instance of this KinesisRecordProcessor upon startup.
+ *
+ * @param receiver Kinesis receiver
+ * @param workerId for logging purposes
+ * @param checkpointState represents the checkpoint state including the next checkpoint time.
+ * It's injected here for mocking purposes.
+ */
+private[kinesis] class KinesisRecordProcessor(
+ receiver: KinesisReceiver,
+ workerId: String,
+ checkpointState: KinesisCheckpointState) extends IRecordProcessor with Logging {
+
+ /* shardId to be populated during initialize() */
+ var shardId: String = _
+
+ /**
+ * The Kinesis Client Library calls this method during IRecordProcessor initialization.
+ *
+ * @param shardId assigned by the KCL to this particular RecordProcessor.
+ */
+ override def initialize(shardId: String) {
+ logInfo(s"Initialize: Initializing workerId $workerId with shardId $shardId")
+ this.shardId = shardId
+ }
+
+ /**
+ * This method is called by the KCL when a batch of records is pulled from the Kinesis stream.
+ * This is the record-processing bridge between the KCL's IRecordProcessor.processRecords()
+ * and Spark Streaming's Receiver.store().
+ *
+ * @param batch list of records from the Kinesis stream shard
+ * @param checkpointer used to update Kinesis when this batch has been processed/stored
+ * in the DStream
+ */
+ override def processRecords(batch: List[Record], checkpointer: IRecordProcessorCheckpointer) {
+ if (!receiver.isStopped()) {
+ try {
+ /*
+ * Note: If we try to store the raw ByteBuffer from record.getData(), the Spark Streaming
+ * Receiver.store(ByteBuffer) attempts to deserialize the ByteBuffer using the
+ * internally-configured Spark serializer (kryo, etc).
+ * This is not desirable, so we instead store a raw Array[Byte] and decouple
+ * ourselves from Spark's internal serialization strategy.
+ */
+ batch.foreach(record => receiver.store(record.getData().array()))
+
+ logDebug(s"Stored: Worker $workerId stored ${batch.size} records for shardId $shardId")
+
+ /*
+ * Checkpoint the sequence number of the last record successfully processed/stored
+ * in the batch.
+ * In this implementation, we're checkpointing after the given checkpoint interval.
+ * Note that this logic requires that processRecords() be called AND that it's time to
+ * checkpoint. I point this out because there is no background thread running the
+ * checkpointer. Checkpointing is tested and triggered only when a new batch comes in.
+ * If the worker is shut down cleanly, a checkpoint will happen (see shutdown() below).
+ * However, if the worker dies unexpectedly, a checkpoint may not happen.
+ * This could lead to records being processed more than once.
+ */
+ if (checkpointState.shouldCheckpoint()) {
+ /* Perform the checkpoint */
+ KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100)
+
+ /* Update the next checkpoint time */
+ checkpointState.advanceCheckpoint()
+
+ logDebug(s"Checkpoint: WorkerId $workerId completed checkpoint of ${batch.size}" +
+ s" records for shardId $shardId")
+ logDebug(s"Checkpoint: Next checkpoint is at " +
+ s" ${checkpointState.checkpointClock.currentTime()} for shardId $shardId")
+ }
+ } catch {
+ case e: Throwable => {
+ /*
+ * If there is a failure within the batch, the batch will not be checkpointed.
+ * This will potentially cause records since the last checkpoint to be processed
+ * more than once.
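+ * Consumers should therefore be prepared for at-least-once semantics: records
+ * that were stored but not yet checkpointed may be replayed after a failure.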
+ */
+ logError(s"Exception: WorkerId $workerId encountered an exception while storing" +
+ s" or checkpointing a batch for shardId $shardId.", e)
+
+ /* Rethrow the exception to the Kinesis Worker that is managing this RecordProcessor. */
+ throw e
+ }
+ }
+ } else {
+ /* RecordProcessor has been stopped. */
+ logInfo(s"Stopped: The Spark KinesisReceiver has stopped for workerId $workerId" +
+ s" and shardId $shardId. No more records will be processed.")
+ }
+ }
+
+ /**
+ * The Kinesis Client Library is shutting down this Worker for 1 of 2 reasons:
+ * 1) the stream is resharding by splitting or merging adjacent shards
+ * (ShutdownReason.TERMINATE)
+ * 2) the failed or latent Worker has stopped sending heartbeats for whatever reason
+ * (ShutdownReason.ZOMBIE)
+ *
+ * @param checkpointer used to perform a Kinesis checkpoint for ShutdownReason.TERMINATE
+ * @param reason for shutdown (ShutdownReason.TERMINATE or ShutdownReason.ZOMBIE)
+ */
+ override def shutdown(checkpointer: IRecordProcessorCheckpointer, reason: ShutdownReason) {
+ logInfo(s"Shutdown: Shutting down workerId $workerId with reason $reason")
+ reason match {
+ /*
+ * TERMINATE Use Case. Checkpoint.
+ * Checkpoint to indicate that all records from the shard have been drained and processed.
+ * It's now OK to read from the new shards that resulted from a resharding event.
+ */
+ case ShutdownReason.TERMINATE =>
+ KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100)
+
+ /*
+ * ZOMBIE Use Case. NoOp.
+ * No checkpoint because other workers may have taken over and already started processing
+ * the same records.
+ * This may lead to records being processed more than once.
+ */
+ case ShutdownReason.ZOMBIE =>
+
+ /* Unknown reason. NoOp */
+ case _ =>
+ }
+ }
+}
+
+private[kinesis] object KinesisRecordProcessor extends Logging {
+ /**
+ * Retry the given number of times with a random backoff time (millis) less than the
+ * given maxBackOffMillis.
+ *
+ * @param expression expression to evaluate
+ * @param numRetriesLeft number of retries left
+ * @param maxBackOffMillis max millis between retries
+ *
+ * @return evaluation of the given expression
+ * @throws Unretryable exception, unexpected exception,
+ * or any exception that persists after numRetriesLeft reaches 0
+ */
+ @annotation.tailrec
+ def retryRandom[T](expression: => T, numRetriesLeft: Int, maxBackOffMillis: Int): T = {
+ util.Try { expression } match {
+ /* If the function succeeded, evaluate to x. */
+ case util.Success(x) => x
+ /* If the function failed, either retry or throw the exception. */
+ case util.Failure(e) => e match {
+ /* Retry: Throttling or other retryable exception has occurred. */
+ case _: ThrottlingException | _: KinesisClientLibDependencyException if numRetriesLeft > 1
+ => {
+ val backOffMillis = Random.nextInt(maxBackOffMillis)
+ Thread.sleep(backOffMillis)
+ logError(s"Retryable Exception: Random backOffMillis=${backOffMillis}", e)
+ retryRandom(expression, numRetriesLeft - 1, maxBackOffMillis)
+ }
+ /* Throw: Shutdown has been requested by the Kinesis Client Library. */
+ case _: ShutdownException => {
+ logError(s"ShutdownException: Caught shutdown exception, skipping checkpoint.", e)
+ throw e
+ }
+ /* Throw: Non-retryable exception has occurred with the Kinesis Client Library. */
+ case _: InvalidStateException => {
+ logError(s"InvalidStateException: Cannot save checkpoint to the DynamoDB table used" +
+ s" by the Amazon Kinesis Client Library.
Table likely doesn't exist.", e) + throw e + } + /* Throw: Unexpected exception has occurred */ + case _ => { + logError(s"Unexpected, non-retryable exception.", e) + throw e + } + } + } + } +} diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala new file mode 100644 index 0000000000000..713cac0e293c0 --- /dev/null +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import org.apache.spark.annotation.Experimental +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.Duration +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.api.java.JavaReceiverInputDStream +import org.apache.spark.streaming.api.java.JavaStreamingContext +import org.apache.spark.streaming.dstream.ReceiverInputDStream + +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream + + +/** + * Helper class to create Amazon Kinesis Input Stream + * :: Experimental :: + */ +@Experimental +object KinesisUtils { + /** + * Create an InputDStream that pulls messages from a Kinesis stream. + * + * @param ssc StreamingContext object + * @param streamName Kinesis stream name + * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) + * @param checkpointInterval Checkpoint interval for Kinesis checkpointing. + * See the Kinesis Spark Streaming documentation for more + * details on the different types of checkpoints. + * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the + * worker's initial starting position in the stream. + * The values are either the beginning of the stream + * per Kinesis' limit of 24 hours + * (InitialPositionInStream.TRIM_HORIZON) or + * the tip of the stream (InitialPositionInStream.LATEST). + * @param storageLevel Storage level to use for storing the received objects + * + * @return ReceiverInputDStream[Array[Byte]] + */ + def createStream( + ssc: StreamingContext, + streamName: String, + endpointUrl: String, + checkpointInterval: Duration, + initialPositionInStream: InitialPositionInStream, + storageLevel: StorageLevel): ReceiverInputDStream[Array[Byte]] = { + ssc.receiverStream(new KinesisReceiver(ssc.sc.appName, streamName, endpointUrl, + checkpointInterval, initialPositionInStream, storageLevel)) + } + + /** + * Create a Java-friendly InputDStream that pulls messages from a Kinesis stream. 
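+ * For example (a sketch only; the stream name and endpoint below are
+ * placeholders, mirroring the call in JavaKinesisStreamSuite):
+ * {{{
+ * JavaReceiverInputDStream<byte[]> kinesisStream = KinesisUtils.createStream(
+ * jssc, "mySparkStream", "https://kinesis.us-west-2.amazonaws.com",
+ * new Duration(2000), InitialPositionInStream.LATEST,
+ * StorageLevel.MEMORY_AND_DISK_2());
+ * }}}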
+ *
+ * @param jssc Java StreamingContext object
+ * @param streamName Kinesis stream name
+ * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
+ * @param checkpointInterval Checkpoint interval for Kinesis checkpointing.
+ * See the Kinesis Spark Streaming documentation for more
+ * details on the different types of checkpoints.
+ * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the
+ * worker's initial starting position in the stream.
+ * The values are either the beginning of the stream
+ * per Kinesis' limit of 24 hours
+ * (InitialPositionInStream.TRIM_HORIZON) or
+ * the tip of the stream (InitialPositionInStream.LATEST).
+ * @param storageLevel Storage level to use for storing the received objects
+ *
+ * @return JavaReceiverInputDStream[Array[Byte]]
+ */
+ def createStream(
+ jssc: JavaStreamingContext,
+ streamName: String,
+ endpointUrl: String,
+ checkpointInterval: Duration,
+ initialPositionInStream: InitialPositionInStream,
+ storageLevel: StorageLevel): JavaReceiverInputDStream[Array[Byte]] = {
+ jssc.receiverStream(new KinesisReceiver(jssc.ssc.sc.appName, streamName,
+ endpointUrl, checkpointInterval, initialPositionInStream, storageLevel))
+ }
+}
diff --git a/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java b/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java
new file mode 100644
index 0000000000000..87954a31f60ce
--- /dev/null
+++ b/extras/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisStreamSuite.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.spark.streaming.kinesis; + +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.Duration; +import org.apache.spark.streaming.LocalJavaStreamingContext; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.junit.Test; + +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; + +/** + * Demonstrate the use of the KinesisUtils Java API + */ +public class JavaKinesisStreamSuite extends LocalJavaStreamingContext { + @Test + public void testKinesisStream() { + // Tests the API, does not actually test data receiving + JavaDStream kinesisStream = KinesisUtils.createStream(ssc, "mySparkStream", + "https://kinesis.us-west-2.amazonaws.com", new Duration(2000), + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2()); + + ssc.stop(); + } +} diff --git a/extras/kinesis-asl/src/test/resources/log4j.properties b/extras/kinesis-asl/src/test/resources/log4j.properties new file mode 100644 index 0000000000000..e01e049595475 --- /dev/null +++ b/extras/kinesis-asl/src/test/resources/log4j.properties @@ -0,0 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +log4j.rootCategory=INFO, file +# log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file.append=false +log4j.appender.file.file=target/unit-tests.log +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n + +# Ignore messages below warning level from Jetty, because it's a bit verbose +log4j.logger.org.eclipse.jetty=WARN diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala new file mode 100644 index 0000000000000..41dbd64c2b1fa --- /dev/null +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -0,0 +1,275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.streaming.kinesis + +import java.nio.ByteBuffer + +import scala.collection.JavaConversions.seqAsJavaList + +import org.apache.spark.annotation.Experimental +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.Milliseconds +import org.apache.spark.streaming.Seconds +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.TestSuiteBase +import org.apache.spark.streaming.util.Clock +import org.apache.spark.streaming.util.ManualClock +import org.scalatest.BeforeAndAfter +import org.scalatest.Matchers +import org.scalatest.mock.EasyMockSugar + +import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ShutdownException +import com.amazonaws.services.kinesis.clientlibrary.exceptions.ThrottlingException +import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason +import com.amazonaws.services.kinesis.model.Record + +/** + * Suite of Kinesis streaming receiver tests focusing mostly on the KinesisRecordProcessor + */ +class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAfter + with EasyMockSugar { + + val app = "TestKinesisReceiver" + val stream = "mySparkStream" + val endpoint = "endpoint-url" + val workerId = "dummyWorkerId" + val shardId = "dummyShardId" + + val record1 = new Record() + record1.setData(ByteBuffer.wrap("Spark In Action".getBytes())) + val record2 = new Record() + record2.setData(ByteBuffer.wrap("Learning Spark".getBytes())) + val batch = List[Record](record1, record2) + + var receiverMock: KinesisReceiver = _ + var checkpointerMock: IRecordProcessorCheckpointer = _ + var checkpointClockMock: ManualClock = _ + var checkpointStateMock: KinesisCheckpointState = _ + var currentClockMock: Clock = _ + + override def beforeFunction() = { + receiverMock = mock[KinesisReceiver] + checkpointerMock = mock[IRecordProcessorCheckpointer] + checkpointClockMock = mock[ManualClock] + checkpointStateMock = mock[KinesisCheckpointState] + currentClockMock = mock[Clock] + } + + test("kinesis utils api") { + val ssc = new StreamingContext(master, framework, batchDuration) + // Tests the API, does not actually test data receiving + val kinesisStream = KinesisUtils.createStream(ssc, "mySparkStream", + "https://kinesis.us-west-2.amazonaws.com", Seconds(2), + InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2); + ssc.stop() + } + + test("process records including store and checkpoint") { + val expectedCheckpointIntervalMillis = 10 + expecting { + receiverMock.isStopped().andReturn(false).once() + receiverMock.store(record1.getData().array()).once() + receiverMock.store(record2.getData().array()).once() + checkpointStateMock.shouldCheckpoint().andReturn(true).once() + checkpointerMock.checkpoint().once() + checkpointStateMock.advanceCheckpoint().once() + } + whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, + checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) + } + } + + 
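+ /*
+ * EasyMock note: expectations such as andReturn(...).once() both stub a call
+ * and assert that it happens exactly once inside the matching
+ * whenExecuting { ... } block.
+ */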
test("shouldn't store and checkpoint when receiver is stopped") { + expecting { + receiverMock.isStopped().andReturn(true).once() + } + whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, + checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) + } + } + + test("shouldn't checkpoint when exception occurs during store") { + expecting { + receiverMock.isStopped().andReturn(false).once() + receiverMock.store(record1.getData().array()).andThrow(new RuntimeException()).once() + } + whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { + intercept[RuntimeException] { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, + checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) + } + } + } + + test("should set checkpoint time to currentTime + checkpoint interval upon instantiation") { + expecting { + currentClockMock.currentTime().andReturn(0).once() + } + whenExecuting(currentClockMock) { + val checkpointIntervalMillis = 10 + val checkpointState = new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) + assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) + } + } + + test("should checkpoint if we have exceeded the checkpoint interval") { + expecting { + currentClockMock.currentTime().andReturn(0).once() + } + whenExecuting(currentClockMock) { + val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MinValue), currentClockMock) + assert(checkpointState.shouldCheckpoint()) + } + } + + test("shouldn't checkpoint if we have not exceeded the checkpoint interval") { + expecting { + currentClockMock.currentTime().andReturn(0).once() + } + whenExecuting(currentClockMock) { + val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MaxValue), currentClockMock) + assert(!checkpointState.shouldCheckpoint()) + } + } + + test("should add to time when advancing checkpoint") { + expecting { + currentClockMock.currentTime().andReturn(0).once() + } + whenExecuting(currentClockMock) { + val checkpointIntervalMillis = 10 + val checkpointState = new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) + assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) + checkpointState.advanceCheckpoint() + assert(checkpointState.checkpointClock.currentTime() == (2 * checkpointIntervalMillis)) + } + } + + test("shutdown should checkpoint if the reason is TERMINATE") { + expecting { + checkpointerMock.checkpoint().once() + } + whenExecuting(checkpointerMock, checkpointStateMock) { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, + checkpointStateMock) + val reason = ShutdownReason.TERMINATE + recordProcessor.shutdown(checkpointerMock, reason) + } + } + + test("shutdown should not checkpoint if the reason is something other than TERMINATE") { + expecting { + } + whenExecuting(checkpointerMock, checkpointStateMock) { + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, + checkpointStateMock) + recordProcessor.shutdown(checkpointerMock, ShutdownReason.ZOMBIE) + recordProcessor.shutdown(checkpointerMock, null) + } + } + + test("retry success on first attempt") { + val expectedIsStopped = false + expecting { + receiverMock.isStopped().andReturn(expectedIsStopped).once() + } + whenExecuting(receiverMock) { + val actualVal = 
KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100)
+ assert(actualVal == expectedIsStopped)
+ }
+ }
+
+ test("retry success on second attempt after a Kinesis throttling exception") {
+ val expectedIsStopped = false
+ expecting {
+ receiverMock.isStopped().andThrow(new ThrottlingException("error message"))
+ .andReturn(expectedIsStopped).once()
+ }
+ whenExecuting(receiverMock) {
+ val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100)
+ assert(actualVal == expectedIsStopped)
+ }
+ }
+
+ test("retry success on second attempt after a Kinesis dependency exception") {
+ val expectedIsStopped = false
+ expecting {
+ receiverMock.isStopped().andThrow(new KinesisClientLibDependencyException("error message"))
+ .andReturn(expectedIsStopped).once()
+ }
+ whenExecuting(receiverMock) {
+ val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100)
+ assert(actualVal == expectedIsStopped)
+ }
+ }
+
+ test("retry failed after a shutdown exception") {
+ expecting {
+ checkpointerMock.checkpoint().andThrow(new ShutdownException("error message")).once()
+ }
+ whenExecuting(checkpointerMock) {
+ intercept[ShutdownException] {
+ KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100)
+ }
+ }
+ }
+
+ test("retry failed after an invalid state exception") {
+ expecting {
+ checkpointerMock.checkpoint().andThrow(new InvalidStateException("error message")).once()
+ }
+ whenExecuting(checkpointerMock) {
+ intercept[InvalidStateException] {
+ KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100)
+ }
+ }
+ }
+
+ test("retry failed after unexpected exception") {
+ expecting {
+ checkpointerMock.checkpoint().andThrow(new RuntimeException("error message")).once()
+ }
+ whenExecuting(checkpointerMock) {
+ intercept[RuntimeException] {
+ KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100)
+ }
+ }
+ }
+
+ test("retry failed after exhausting all retries") {
+ val expectedErrorMessage = "final try error message"
+ expecting {
+ checkpointerMock.checkpoint().andThrow(new ThrottlingException("error message"))
+ .andThrow(new ThrottlingException(expectedErrorMessage)).once()
+ }
+ whenExecuting(checkpointerMock) {
+ val exception = intercept[RuntimeException] {
+ KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100)
+ }
+ exception.getMessage().shouldBe(expectedErrorMessage)
+ }
+ }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 7d912737b8f0b..1d5d3762ed8e9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -19,6 +19,8 @@ package org.apache.spark.mllib.api.python
 
 import java.nio.{ByteBuffer, ByteOrder}
 
+import scala.collection.JavaConverters._
+
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.mllib.classification._
@@ -29,6 +31,11 @@ import org.apache.spark.mllib.linalg.{Matrix, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.random.{RandomRDDGenerators => RG}
 import org.apache.spark.mllib.recommendation._
 import org.apache.spark.mllib.regression._
+import org.apache.spark.mllib.tree.configuration.Algo._
+import org.apache.spark.mllib.tree.configuration.Strategy
+import org.apache.spark.mllib.tree.DecisionTree
+import
org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance} +import org.apache.spark.mllib.tree.model.DecisionTreeModel import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.stat.correlation.CorrelationNames import org.apache.spark.mllib.util.MLUtils @@ -472,6 +479,76 @@ class PythonMLLibAPI extends Serializable { ALS.trainImplicit(ratings, rank, iterations, lambda, blocks, alpha) } + /** + * Java stub for Python mllib DecisionTree.train(). + * This stub returns a handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on exit; + * see the Py4J documentation. + * @param dataBytesJRDD Training data + * @param categoricalFeaturesInfoJMap Categorical features info, as Java map + */ + def trainDecisionTreeModel( + dataBytesJRDD: JavaRDD[Array[Byte]], + algoStr: String, + numClasses: Int, + categoricalFeaturesInfoJMap: java.util.Map[Int, Int], + impurityStr: String, + maxDepth: Int, + maxBins: Int): DecisionTreeModel = { + + val data = dataBytesJRDD.rdd.map(deserializeLabeledPoint) + + val algo: Algo = algoStr match { + case "classification" => Classification + case "regression" => Regression + case _ => throw new IllegalArgumentException(s"Bad algoStr parameter: $algoStr") + } + val impurity: Impurity = impurityStr match { + case "gini" => Gini + case "entropy" => Entropy + case "variance" => Variance + case _ => throw new IllegalArgumentException(s"Bad impurityStr parameter: $impurityStr") + } + + val strategy = new Strategy( + algo = algo, + impurity = impurity, + maxDepth = maxDepth, + numClassesForClassification = numClasses, + maxBins = maxBins, + categoricalFeaturesInfo = categoricalFeaturesInfoJMap.asScala.toMap) + + DecisionTree.train(data, strategy) + } + + /** + * Predict the label of the given data point. + * This is a Java stub for python DecisionTreeModel.predict() + * + * @param featuresBytes Serialized feature vector for data point + * @return predicted label + */ + def predictDecisionTreeModel( + model: DecisionTreeModel, + featuresBytes: Array[Byte]): Double = { + val features: Vector = deserializeDoubleVector(featuresBytes) + model.predict(features) + } + + /** + * Predict the labels of the given data points. + * This is a Java stub for python DecisionTreeModel.predict() + * + * @param dataJRDD A JavaRDD with serialized feature vectors + * @return JavaRDD of serialized predictions + */ + def predictDecisionTreeModel( + model: DecisionTreeModel, + dataJRDD: JavaRDD[Array[Byte]]): JavaRDD[Array[Byte]] = { + val data = dataJRDD.rdd.map(xBytes => deserializeDoubleVector(xBytes)) + model.predict(data).map(serializeDouble) + } + /** * Java stub for mllib Statistics.corr(X: RDD[Vector], method: String). 
* Returns the correlation matrix serialized into a byte array understood by deserializers in @@ -597,4 +674,5 @@ class PythonMLLibAPI extends Serializable { val s = getSeedOrDefault(seed) RG.poissonVectorRDD(jsc.sc, mean, numRows, numCols, parts, s).map(serializeDoubleVector) } + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index 5c65b537b6867..fdad4f029aa99 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -56,7 +56,8 @@ class Strategy ( if (algo == Classification) { require(numClassesForClassification >= 2) } - val isMulticlassClassification = numClassesForClassification > 2 + val isMulticlassClassification = + algo == Classification && numClassesForClassification > 2 val isMulticlassWithCategoricalFeatures = isMulticlassClassification && (categoricalFeaturesInfo.size > 0) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala index 546a132559326..8665a00f3b356 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala @@ -48,7 +48,8 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext { requiredMSE: Double) { val predictions = input.map(x => model.predict(x.features)) val squaredError = predictions.zip(input).map { case (prediction, expected) => - (prediction - expected.label) * (prediction - expected.label) + val err = prediction - expected.label + err * err }.sum val mse = squaredError / input.length assert(mse <= requiredMSE) diff --git a/pom.xml b/pom.xml index ae97bf03c53a2..cc9377cec2a07 100644 --- a/pom.xml +++ b/pom.xml @@ -134,6 +134,8 @@ 3.0.0 1.7.6 0.7.1 + 1.8.3 + 1.1.0 64m 512m @@ -868,10 +870,11 @@ ${project.build.directory}/SparkTestSuite.txt -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m - - ${session.executionRootDirectory} - 1 - + + true + ${session.executionRootDirectory} + 1 + @@ -1011,6 +1014,14 @@ + + + kinesis-asl + + extras/kinesis-asl + + + java8-tests diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 1d7cc6dd6aef3..aac621fe53938 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -37,8 +37,8 @@ object BuildCommons { "spark", "sql", "streaming", "streaming-flume-sink", "streaming-flume", "streaming-kafka", "streaming-mqtt", "streaming-twitter", "streaming-zeromq").map(ProjectRef(buildLocation, _)) - val optionallyEnabledProjects@Seq(yarn, yarnStable, yarnAlpha, java8Tests, sparkGangliaLgpl) = - Seq("yarn", "yarn-stable", "yarn-alpha", "java8-tests", "ganglia-lgpl") + val optionallyEnabledProjects@Seq(yarn, yarnStable, yarnAlpha, java8Tests, sparkGangliaLgpl, sparkKinesisAsl) = + Seq("yarn", "yarn-stable", "yarn-alpha", "java8-tests", "ganglia-lgpl", "kinesis-asl") .map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(assembly, examples) = Seq("assembly", "examples") @@ -62,7 +62,7 @@ object SparkBuild extends PomBuild { var isAlphaYarn = false var profiles: mutable.Seq[String] = mutable.Seq.empty if (Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined) { - println("NOTE: SPARK_GANGLIA_LGPL is deprecated, please use -Pganglia-lgpl flag.") + println("NOTE: SPARK_GANGLIA_LGPL is deprecated, please use -Pspark-ganglia-lgpl 
flag.") profiles ++= Seq("spark-ganglia-lgpl") } if (Properties.envOrNone("SPARK_HIVE").isDefined) { diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py index fca23111d3c2b..1503c567f13ea 100644 --- a/python/pyspark/mllib/_common.py +++ b/python/pyspark/mllib/_common.py @@ -350,23 +350,35 @@ def _copyto(array, buffer, offset, shape, dtype): temp_array[...] = array -def _get_unmangled_rdd(data, serializer): +def _get_unmangled_rdd(data, serializer, cache=True): + """ + :param cache: If True, the serialized RDD is cached. (default = True) + WARNING: Users should unpersist() this later! + """ dataBytes = data.map(serializer) dataBytes._bypass_serializer = True - dataBytes.cache() # TODO: users should unpersist() this later! + if cache: + dataBytes.cache() return dataBytes -# Map a pickled Python RDD of Python dense or sparse vectors to a Java RDD of -# _serialized_double_vectors -def _get_unmangled_double_vector_rdd(data): - return _get_unmangled_rdd(data, _serialize_double_vector) +def _get_unmangled_double_vector_rdd(data, cache=True): + """ + Map a pickled Python RDD of Python dense or sparse vectors to a Java RDD of + _serialized_double_vectors. + :param cache: If True, the serialized RDD is cached. (default = True) + WARNING: Users should unpersist() this later! + """ + return _get_unmangled_rdd(data, _serialize_double_vector, cache) -# Map a pickled Python RDD of LabeledPoint to a Java RDD of -# _serialized_labeled_points -def _get_unmangled_labeled_point_rdd(data): - return _get_unmangled_rdd(data, _serialize_labeled_point) +def _get_unmangled_labeled_point_rdd(data, cache=True): + """ + Map a pickled Python RDD of LabeledPoint to a Java RDD of _serialized_labeled_points. + :param cache: If True, the serialized RDD is cached. (default = True) + WARNING: Users should unpersist() this later! + """ + return _get_unmangled_rdd(data, _serialize_labeled_point, cache) # Common functions for dealing with and training linear models @@ -388,7 +400,7 @@ def _linear_predictor_typecheck(x, coeffs): if x.size != coeffs.shape[0]: raise RuntimeError("Got sparse vector of size %d; wanted %d" % ( x.size, coeffs.shape[0])) - elif (type(x) == RDD): + elif isinstance(x, RDD): raise RuntimeError("Bulk predict not yet supported.") else: raise TypeError( diff --git a/python/pyspark/mllib/_common.py.orig b/python/pyspark/mllib/_common.py.orig new file mode 100644 index 0000000000000..e55270733f55b --- /dev/null +++ b/python/pyspark/mllib/_common.py.orig @@ -0,0 +1,572 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import struct +import numpy +from numpy import ndarray, float64, int64, int32, array_equal, array +from pyspark import SparkContext, RDD +from pyspark.mllib.linalg import SparseVector +from pyspark.serializers import Serializer + + +""" +Common utilities shared throughout MLlib, primarily for dealing with +different data types. These include: +- Serialization utilities to / from byte arrays that Java can handle +- Serializers for other data types, like ALS Rating objects +- Common methods for linear models +- Methods to deal with the different vector types we support, such as + SparseVector and scipy.sparse matrices. +""" + + +# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, +# such as _dot and _serialize_double_vector, start to support scipy.sparse +# matrices. + +_have_scipy = False +_scipy_issparse = None +try: + import scipy.sparse + _have_scipy = True + _scipy_issparse = scipy.sparse.issparse +except: + # No SciPy in environment, but that's okay + pass + + +# Serialization functions to and from Scala. These use the following formats, understood +# by the PythonMLLibAPI class in Scala: +# +# Dense double vector format: +# +# [1-byte 1] [4-byte length] [length*8 bytes of data] +# +# Sparse double vector format: +# +# [1-byte 2] [4-byte length] [4-byte nonzeros] [nonzeros*4 bytes of indices] \ +# [nonzeros*8 bytes of values] +# +# Double matrix format: +# +# [1-byte 3] [4-byte rows] [4-byte cols] [rows*cols*8 bytes of data] +# +# LabeledPoint format: +# +# [1-byte 4] [8-byte label] [dense or sparse vector] +# +# This is all in machine-endian. That means that the Java interpreter and the +# Python interpreter must agree on what endian the machine is. + + +DENSE_VECTOR_MAGIC = 1 +SPARSE_VECTOR_MAGIC = 2 +DENSE_MATRIX_MAGIC = 3 +LABELED_POINT_MAGIC = 4 + + +def _deserialize_numpy_array(shape, ba, offset, dtype=float64): + """ + Deserialize a numpy array of the given type from an offset in + bytearray ba, assigning it the given shape. + + >>> x = array([1.0, 2.0, 3.0, 4.0, 5.0]) + >>> array_equal(x, _deserialize_numpy_array(x.shape, x.data, 0)) + True + >>> x = array([1.0, 2.0, 3.0, 4.0]).reshape(2,2) + >>> array_equal(x, _deserialize_numpy_array(x.shape, x.data, 0)) + True + >>> x = array([1, 2, 3], dtype=int32) + >>> array_equal(x, _deserialize_numpy_array(x.shape, x.data, 0, dtype=int32)) + True + """ + ar = ndarray(shape=shape, buffer=ba, offset=offset, dtype=dtype, order='C') + return ar.copy() + + +def _serialize_double(d): + """ + Serialize a double (float or numpy.float64) into a mutually understood format. + """ + if type(d) == float or type(d) == float64 or type(d) == int or type(d) == long: + d = float64(d) + ba = bytearray(8) + _copyto(d, buffer=ba, offset=0, shape=[1], dtype=float64) + return ba + else: + raise TypeError("_serialize_double called on non-float input") + + +def _serialize_double_vector(v): + """ + Serialize a double vector into a mutually understood format. + + Note: we currently do not use a magic byte for double for storage + efficiency. This should be reconsidered when we add Ser/De for other + 8-byte types (e.g. Long), for safety. The corresponding deserializer, + _deserialize_double, needs to be modified as well if the serialization + scheme changes. 
+ + >>> x = array([1,2,3]) + >>> y = _deserialize_double_vector(_serialize_double_vector(x)) + >>> array_equal(y, array([1.0, 2.0, 3.0])) + True + """ + v = _convert_vector(v) + if type(v) == ndarray: + return _serialize_dense_vector(v) + elif type(v) == SparseVector: + return _serialize_sparse_vector(v) + else: + raise TypeError("_serialize_double_vector called on a %s; " + "wanted ndarray or SparseVector" % type(v)) + + +def _serialize_dense_vector(v): + """Serialize a dense vector given as a NumPy array.""" + if v.ndim != 1: + raise TypeError("_serialize_double_vector called on a %ddarray; " + "wanted a 1darray" % v.ndim) + if v.dtype != float64: + if numpy.issubdtype(v.dtype, numpy.complex): + raise TypeError("_serialize_double_vector called on an ndarray of %s; " + "wanted ndarray of float64" % v.dtype) + v = v.astype(float64) + length = v.shape[0] + ba = bytearray(5 + 8 * length) + ba[0] = DENSE_VECTOR_MAGIC + length_bytes = ndarray(shape=[1], buffer=ba, offset=1, dtype=int32) + length_bytes[0] = length + _copyto(v, buffer=ba, offset=5, shape=[length], dtype=float64) + return ba + + +def _serialize_sparse_vector(v): + """Serialize a pyspark.mllib.linalg.SparseVector.""" + nonzeros = len(v.indices) + ba = bytearray(9 + 12 * nonzeros) + ba[0] = SPARSE_VECTOR_MAGIC + header = ndarray(shape=[2], buffer=ba, offset=1, dtype=int32) + header[0] = v.size + header[1] = nonzeros + _copyto(v.indices, buffer=ba, offset=9, shape=[nonzeros], dtype=int32) + values_offset = 9 + 4 * nonzeros + _copyto(v.values, buffer=ba, offset=values_offset, + shape=[nonzeros], dtype=float64) + return ba + + +def _deserialize_double(ba, offset=0): + """Deserialize a double from a mutually understood format. + + >>> import sys + >>> _deserialize_double(_serialize_double(123.0)) == 123.0 + True + >>> _deserialize_double(_serialize_double(float64(0.0))) == 0.0 + True + >>> _deserialize_double(_serialize_double(1)) == 1.0 + True + >>> _deserialize_double(_serialize_double(1L)) == 1.0 + True + >>> x = sys.float_info.max + >>> _deserialize_double(_serialize_double(sys.float_info.max)) == x + True + >>> y = float64(sys.float_info.max) + >>> _deserialize_double(_serialize_double(sys.float_info.max)) == y + True + """ + if type(ba) != bytearray: + raise TypeError( + "_deserialize_double called on a %s; wanted bytearray" % type(ba)) + if len(ba) - offset != 8: + raise TypeError( + "_deserialize_double called on a %d-byte array; wanted 8 bytes." % nb) + return struct.unpack("d", ba[offset:])[0] + + +def _deserialize_double_vector(ba, offset=0): + """Deserialize a double vector from a mutually understood format. 
+ + >>> x = array([1.0, 2.0, 3.0, 4.0, -1.0, 0.0, -0.0]) + >>> array_equal(x, _deserialize_double_vector(_serialize_double_vector(x))) + True + >>> s = SparseVector(4, [1, 3], [3.0, 5.5]) + >>> s == _deserialize_double_vector(_serialize_double_vector(s)) + True + """ + if type(ba) != bytearray: + raise TypeError("_deserialize_double_vector called on a %s; " + "wanted bytearray" % type(ba)) + nb = len(ba) - offset + if nb < 5: + raise TypeError("_deserialize_double_vector called on a %d-byte array, " + "which is too short" % nb) + if ba[offset] == DENSE_VECTOR_MAGIC: + return _deserialize_dense_vector(ba, offset) + elif ba[offset] == SPARSE_VECTOR_MAGIC: + return _deserialize_sparse_vector(ba, offset) + else: + raise TypeError("_deserialize_double_vector called on bytearray " + "with wrong magic") + + +def _deserialize_dense_vector(ba, offset=0): + """Deserialize a dense vector into a numpy array.""" + nb = len(ba) - offset + if nb < 5: + raise TypeError("_deserialize_dense_vector called on a %d-byte array, " + "which is too short" % nb) + length = ndarray(shape=[1], buffer=ba, offset=offset + 1, dtype=int32)[0] + if nb < 8 * length + 5: + raise TypeError("_deserialize_dense_vector called on bytearray " + "with wrong length") + return _deserialize_numpy_array([length], ba, offset + 5) + + +def _deserialize_sparse_vector(ba, offset=0): + """Deserialize a sparse vector into a MLlib SparseVector object.""" + nb = len(ba) - offset + if nb < 9: + raise TypeError("_deserialize_sparse_vector called on a %d-byte array, " + "which is too short" % nb) + header = ndarray(shape=[2], buffer=ba, offset=offset + 1, dtype=int32) + size = header[0] + nonzeros = header[1] + if nb < 9 + 12 * nonzeros: + raise TypeError("_deserialize_sparse_vector called on bytearray " + "with wrong length") + indices = _deserialize_numpy_array([nonzeros], ba, offset + 9, dtype=int32) + values = _deserialize_numpy_array( + [nonzeros], ba, offset + 9 + 4 * nonzeros, dtype=float64) + return SparseVector(int(size), indices, values) + + +def _serialize_double_matrix(m): + """Serialize a double matrix into a mutually understood format.""" + if (type(m) == ndarray and m.ndim == 2): + if m.dtype != float64: + if numpy.issubdtype(m.dtype, numpy.complex): + raise TypeError("_serialize_double_matrix called on an ndarray of %s; " + "wanted ndarray of float64" % m.dtype) + m = m.astype(float64) + rows = m.shape[0] + cols = m.shape[1] + ba = bytearray(9 + 8 * rows * cols) + ba[0] = DENSE_MATRIX_MAGIC + lengths = ndarray(shape=[3], buffer=ba, offset=1, dtype=int32) + lengths[0] = rows + lengths[1] = cols + _copyto(m, buffer=ba, offset=9, shape=[rows, cols], dtype=float64) + return ba + else: + raise TypeError("_serialize_double_matrix called on a " + "non-double-matrix") + + +def _deserialize_double_matrix(ba): + """Deserialize a double matrix from a mutually understood format.""" + if type(ba) != bytearray: + raise TypeError("_deserialize_double_matrix called on a %s; " + "wanted bytearray" % type(ba)) + if len(ba) < 9: + raise TypeError("_deserialize_double_matrix called on a %d-byte array, " + "which is too short" % len(ba)) + if ba[0] != DENSE_MATRIX_MAGIC: + raise TypeError("_deserialize_double_matrix called on bytearray " + "with wrong magic") + lengths = ndarray(shape=[2], buffer=ba, offset=1, dtype=int32) + rows = lengths[0] + cols = lengths[1] + if (len(ba) != 8 * rows * cols + 9): + raise TypeError("_deserialize_double_matrix called on bytearray " + "with wrong length") + return _deserialize_numpy_array([rows, cols], ba, 9) + + 
+def _serialize_labeled_point(p): + """ + Serialize a LabeledPoint with a features vector of any type. + + >>> from pyspark.mllib.regression import LabeledPoint + >>> dp0 = LabeledPoint(0.5, array([1.0, 2.0, 3.0, 4.0, -1.0, 0.0, -0.0])) + >>> dp1 = _deserialize_labeled_point(_serialize_labeled_point(dp0)) + >>> dp1.label == dp0.label + True + >>> array_equal(dp1.features, dp0.features) + True + >>> sp0 = LabeledPoint(0.0, SparseVector(4, [1, 3], [3.0, 5.5])) + >>> sp1 = _deserialize_labeled_point(_serialize_labeled_point(sp0)) + >>> sp1.label == sp1.label + True + >>> sp1.features == sp0.features + True + """ + from pyspark.mllib.regression import LabeledPoint + serialized_features = _serialize_double_vector(p.features) + header = bytearray(9) + header[0] = LABELED_POINT_MAGIC + header_float = ndarray(shape=[1], buffer=header, offset=1, dtype=float64) + header_float[0] = p.label + return header + serialized_features + + +def _deserialize_labeled_point(ba, offset=0): + """Deserialize a LabeledPoint from a mutually understood format.""" + from pyspark.mllib.regression import LabeledPoint + if type(ba) != bytearray: + raise TypeError("Expecting a bytearray but got %s" % type(ba)) + if ba[offset] != LABELED_POINT_MAGIC: + raise TypeError("Expecting magic number %d but got %d" % + (LABELED_POINT_MAGIC, ba[0])) + label = ndarray(shape=[1], buffer=ba, offset=offset + 1, dtype=float64)[0] + features = _deserialize_double_vector(ba, offset + 9) + return LabeledPoint(label, features) + + +def _copyto(array, buffer, offset, shape, dtype): + """ + Copy the contents of a vector to a destination bytearray at the + given offset. + + TODO: In the future this could use numpy.copyto on NumPy 1.7+, but + we should benchmark that to see whether it provides a benefit. + """ + temp_array = ndarray( + shape=shape, buffer=buffer, offset=offset, dtype=dtype, order='C') + temp_array[...] = array + + +def _get_unmangled_rdd(data, serializer, cache=True): + """ + :param cache: If True, the serialized RDD is cached. (default = True) + WARNING: Users should unpersist() this later! + """ + dataBytes = data.map(serializer) + dataBytes._bypass_serializer = True + if cache: + dataBytes.cache() + return dataBytes + + +def _get_unmangled_double_vector_rdd(data, cache=True): + """ + Map a pickled Python RDD of Python dense or sparse vectors to a Java RDD of + _serialized_double_vectors. + :param cache: If True, the serialized RDD is cached. (default = True) + WARNING: Users should unpersist() this later! + """ + return _get_unmangled_rdd(data, _serialize_double_vector, cache) + + +<<<<<<< HEAD +# Map a pickled Python RDD of LabeledPoint to a Java RDD of +# _serialized_labeled_points +def _get_unmangled_labeled_point_rdd(data): + return _get_unmangled_rdd(data, _serialize_labeled_point) +======= +def _get_unmangled_labeled_point_rdd(data, cache=True): + """ + Map a pickled Python RDD of LabeledPoint to a Java RDD of _serialized_labeled_points. + :param cache: If True, the serialized RDD is cached. (default = True) + WARNING: Users should unpersist() this later! + """ + return _get_unmangled_rdd(data, _serialize_labeled_point, cache) +>>>>>>> 3dc55fdf450b4237f7c592fce56d1467fd206366 + + +# Common functions for dealing with and training linear models + +def _linear_predictor_typecheck(x, coeffs): + """ + Check that x is a one-dimensional vector of the right shape. + This is a temporary hackaround until we actually implement bulk predict. 
+ """ + x = _convert_vector(x) + if type(x) == ndarray: + if x.ndim == 1: + if x.shape != coeffs.shape: + raise RuntimeError("Got array of %d elements; wanted %d" % ( + numpy.shape(x)[0], coeffs.shape[0])) + else: + raise RuntimeError("Bulk predict not yet supported.") + elif type(x) == SparseVector: + if x.size != coeffs.shape[0]: + raise RuntimeError("Got sparse vector of size %d; wanted %d" % ( + x.size, coeffs.shape[0])) + elif isinstance(x, RDD): + raise RuntimeError("Bulk predict not yet supported.") + else: + raise TypeError( + "Argument of type " + type(x).__name__ + " unsupported") + + +# If we weren't given initial weights, take a zero vector of the appropriate +# length. +def _get_initial_weights(initial_weights, data): + if initial_weights is None: + initial_weights = _convert_vector(data.first().features) + if type(initial_weights) == ndarray: + if initial_weights.ndim != 1: + raise TypeError("At least one data element has " + + initial_weights.ndim + " dimensions, which is not 1") + initial_weights = numpy.zeros([initial_weights.shape[0]]) + elif type(initial_weights) == SparseVector: + initial_weights = numpy.zeros([initial_weights.size]) + return initial_weights + + +# train_func should take two parameters, namely data and initial_weights, and +# return the result of a call to the appropriate JVM stub. +# _regression_train_wrapper is responsible for setup and error checking. +def _regression_train_wrapper(sc, train_func, klass, data, initial_weights): + initial_weights = _get_initial_weights(initial_weights, data) + dataBytes = _get_unmangled_labeled_point_rdd(data) + ans = train_func(dataBytes, _serialize_double_vector(initial_weights)) + if len(ans) != 2: + raise RuntimeError("JVM call result had unexpected length") + elif type(ans[0]) != bytearray: + raise RuntimeError("JVM call result had first element of type " + + type(ans[0]).__name__ + " which is not bytearray") + elif type(ans[1]) != float: + raise RuntimeError("JVM call result had second element of type " + + type(ans[0]).__name__ + " which is not float") + return klass(_deserialize_double_vector(ans[0]), ans[1]) + + +# Functions for serializing ALS Rating objects and tuples + +def _serialize_rating(r): + ba = bytearray(16) + intpart = ndarray(shape=[2], buffer=ba, dtype=int32) + doublepart = ndarray(shape=[1], buffer=ba, dtype=float64, offset=8) + intpart[0], intpart[1], doublepart[0] = r + return ba + + +class RatingDeserializer(Serializer): + + def loads(self, stream): + length = struct.unpack("!i", stream.read(4))[0] + ba = stream.read(length) + res = ndarray(shape=(3, ), buffer=ba, dtype=float64, offset=4) + return int(res[0]), int(res[1]), res[2] + + def load_stream(self, stream): + while True: + try: + yield self.loads(stream) + except struct.error: + return + except EOFError: + return + + +def _serialize_tuple(t): + ba = bytearray(8) + intpart = ndarray(shape=[2], buffer=ba, dtype=int32) + intpart[0], intpart[1] = t + return ba + + +# Vector math functions that support all of our vector types + +def _convert_vector(vec): + """ + Convert a vector to a format we support internally. This does + the following: + + * For dense NumPy vectors (ndarray), returns them as is + * For our SparseVector class, returns that as is + * For Python lists, converts them to NumPy vectors + * For scipy.sparse.*_matrix column vectors, converts them to + our own SparseVector type. + + This should be called before passing any data to our algorithms + or attempting to serialize it to Java. 
+ """ + if type(vec) == ndarray or type(vec) == SparseVector: + return vec + elif type(vec) == list: + return array(vec, dtype=float64) + elif _have_scipy: + if _scipy_issparse(vec): + assert vec.shape[1] == 1, "Expected column vector" + csc = vec.tocsc() + return SparseVector(vec.shape[0], csc.indices, csc.data) + raise TypeError( + "Expected NumPy array, SparseVector, or scipy.sparse matrix") + + +def _squared_distance(v1, v2): + """ + Squared distance of two NumPy or sparse vectors. + + >>> dense1 = array([1., 2.]) + >>> sparse1 = SparseVector(2, [0, 1], [1., 2.]) + >>> dense2 = array([2., 1.]) + >>> sparse2 = SparseVector(2, [0, 1], [2., 1.]) + >>> _squared_distance(dense1, dense2) + 2.0 + >>> _squared_distance(dense1, sparse2) + 2.0 + >>> _squared_distance(sparse1, dense2) + 2.0 + >>> _squared_distance(sparse1, sparse2) + 2.0 + """ + v1 = _convert_vector(v1) + v2 = _convert_vector(v2) + if type(v1) == ndarray and type(v2) == ndarray: + diff = v1 - v2 + return numpy.dot(diff, diff) + elif type(v1) == ndarray: + return v2.squared_distance(v1) + else: + return v1.squared_distance(v2) + + +def _dot(vec, target): + """ + Compute the dot product of a vector of the types we support + (Numpy array, list, SparseVector, or SciPy sparse) and a target + NumPy array that is either 1- or 2-dimensional. Equivalent to + calling numpy.dot of the two vectors, but for SciPy ones, we + have to transpose them because they're column vectors. + """ + if type(vec) == ndarray: + return numpy.dot(vec, target) + elif type(vec) == SparseVector: + return vec.dot(target) + elif type(vec) == list: + return numpy.dot(_convert_vector(vec), target) + else: + return vec.transpose().dot(target)[0] + + +def _test(): + import doctest + globs = globals().copy() + globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) + globs['sc'].stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index d5dd07299269b..24843d3c4eaf2 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -108,6 +108,7 @@ def test_clustering(self): def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes + from pyspark.mllib.tree import DecisionTree data = [ LabeledPoint(0.0, [1, 0, 0]), LabeledPoint(1.0, [0, 1, 1]), @@ -135,9 +136,19 @@ def test_classification(self): self.assertTrue(nb_model.predict(features[2]) <= 0) self.assertTrue(nb_model.predict(features[3]) > 0) + categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories + dt_model = \ + DecisionTree.trainClassifier(rdd, numClasses=2, + categoricalFeaturesInfo=categoricalFeaturesInfo) + self.assertTrue(dt_model.predict(features[0]) <= 0) + self.assertTrue(dt_model.predict(features[1]) > 0) + self.assertTrue(dt_model.predict(features[2]) <= 0) + self.assertTrue(dt_model.predict(features[3]) > 0) + def test_regression(self): from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ RidgeRegressionWithSGD + from pyspark.mllib.tree import DecisionTree data = [ LabeledPoint(-1.0, [0, -1]), LabeledPoint(1.0, [0, 1]), @@ -165,6 +176,14 @@ def test_regression(self): self.assertTrue(rr_model.predict(features[2]) <= 0) self.assertTrue(rr_model.predict(features[3]) > 0) + categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories + dt_model = \ + DecisionTree.trainRegressor(rdd, 
categoricalFeaturesInfo=categoricalFeaturesInfo) + self.assertTrue(dt_model.predict(features[0]) <= 0) + self.assertTrue(dt_model.predict(features[1]) > 0) + self.assertTrue(dt_model.predict(features[2]) <= 0) + self.assertTrue(dt_model.predict(features[3]) > 0) + @unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): @@ -243,6 +262,7 @@ def test_clustering(self): def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes + from pyspark.mllib.tree import DecisionTree data = [ LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})), LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})), @@ -270,9 +290,18 @@ def test_classification(self): self.assertTrue(nb_model.predict(features[2]) <= 0) self.assertTrue(nb_model.predict(features[3]) > 0) + categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories + dt_model = DecisionTree.trainClassifier(rdd, numClasses=2, + categoricalFeaturesInfo=categoricalFeaturesInfo) + self.assertTrue(dt_model.predict(features[0]) <= 0) + self.assertTrue(dt_model.predict(features[1]) > 0) + self.assertTrue(dt_model.predict(features[2]) <= 0) + self.assertTrue(dt_model.predict(features[3]) > 0) + def test_regression(self): from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ RidgeRegressionWithSGD + from pyspark.mllib.tree import DecisionTree data = [ LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})), LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})), @@ -300,6 +329,13 @@ def test_regression(self): self.assertTrue(rr_model.predict(features[2]) <= 0) self.assertTrue(rr_model.predict(features[3]) > 0) + categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories + dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) + self.assertTrue(dt_model.predict(features[0]) <= 0) + self.assertTrue(dt_model.predict(features[1]) > 0) + self.assertTrue(dt_model.predict(features[2]) <= 0) + self.assertTrue(dt_model.predict(features[3]) > 0) + if __name__ == "__main__": if not _have_scipy: diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py new file mode 100644 index 0000000000000..1e0006df75ac6 --- /dev/null +++ b/python/pyspark/mllib/tree.py @@ -0,0 +1,225 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from py4j.java_collections import MapConverter + +from pyspark import SparkContext, RDD +from pyspark.mllib._common import \ + _get_unmangled_rdd, _get_unmangled_double_vector_rdd, _serialize_double_vector, \ + _deserialize_labeled_point, _get_unmangled_labeled_point_rdd, \ + _deserialize_double +from pyspark.mllib.regression import LabeledPoint +from pyspark.serializers import NoOpSerializer + +class DecisionTreeModel(object): + """ + A decision tree model for classification or regression. + + EXPERIMENTAL: This is an experimental API. + It will probably be modified for Spark v1.2. + """ + + def __init__(self, sc, java_model): + """ + :param sc: Spark context + :param java_model: Handle to Java model object + """ + self._sc = sc + self._java_model = java_model + + def __del__(self): + self._sc._gateway.detach(self._java_model) + + def predict(self, x): + """ + Predict the label of one or more examples. + :param x: Data point (feature vector), + or an RDD of data points (feature vectors). + """ + pythonAPI = self._sc._jvm.PythonMLLibAPI() + if isinstance(x, RDD): + # Bulk prediction + if x.count() == 0: + return self._sc.parallelize([]) + dataBytes = _get_unmangled_double_vector_rdd(x, cache=False) + jSerializedPreds = \ + pythonAPI.predictDecisionTreeModel(self._java_model, + dataBytes._jrdd) + serializedPreds = RDD(jSerializedPreds, self._sc, NoOpSerializer()) + return serializedPreds.map(lambda bytes: _deserialize_double(bytearray(bytes))) + else: + # Assume x is a single data point. + x_ = _serialize_double_vector(x) + return pythonAPI.predictDecisionTreeModel(self._java_model, x_) + + def numNodes(self): + return self._java_model.numNodes() + + def depth(self): + return self._java_model.depth() + + def __str__(self): + return self._java_model.toString() + + +class DecisionTree(object): + """ + Learning algorithm for a decision tree model + for classification or regression. + + EXPERIMENTAL: This is an experimental API. + It will probably be modified for Spark v1.2. + + Example usage: + >>> from numpy import array, ndarray + >>> from pyspark.mllib.regression import LabeledPoint + >>> from pyspark.mllib.tree import DecisionTree + >>> from pyspark.mllib.linalg import SparseVector + >>> + >>> data = [ + ... LabeledPoint(0.0, [0.0]), + ... LabeledPoint(1.0, [1.0]), + ... LabeledPoint(1.0, [2.0]), + ... LabeledPoint(1.0, [3.0]) + ... ] + >>> + >>> model = DecisionTree.trainClassifier(sc.parallelize(data), numClasses=2) + >>> print(model) + DecisionTreeModel classifier + If (feature 0 <= 0.5) + Predict: 0.0 + Else (feature 0 > 0.5) + Predict: 1.0 + + >>> model.predict(array([1.0])) > 0 + True + >>> model.predict(array([0.0])) == 0 + True + >>> sparse_data = [ + ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), + ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), + ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), + ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) + ... ] + >>> + >>> model = DecisionTree.trainRegressor(sc.parallelize(sparse_data)) + >>> model.predict(array([0.0, 1.0])) == 1 + True + >>> model.predict(array([0.0, 0.0])) == 0 + True + >>> model.predict(SparseVector(2, {1: 1.0})) == 1 + True + >>> model.predict(SparseVector(2, {1: 0.0})) == 0 + True + """ + + @staticmethod + def trainClassifier(data, numClasses, categoricalFeaturesInfo={}, + impurity="gini", maxDepth=4, maxBins=100): + """ + Train a DecisionTreeModel for classification. + + :param data: Training data: RDD of LabeledPoint. + Labels are integers {0,1,...,numClasses}. 
+ :param numClasses: Number of classes for classification. + :param categoricalFeaturesInfo: Map from categorical feature index + to number of categories. + Any feature not in this map + is treated as continuous. + :param impurity: Supported values: "entropy" or "gini" + :param maxDepth: Max depth of tree. + E.g., depth 0 means 1 leaf node. + Depth 1 means 1 internal node + 2 leaf nodes. + :param maxBins: Number of bins used for finding splits at each node. + :return: DecisionTreeModel + """ + return DecisionTree.train(data, "classification", numClasses, + categoricalFeaturesInfo, + impurity, maxDepth, maxBins) + + @staticmethod + def trainRegressor(data, categoricalFeaturesInfo={}, + impurity="variance", maxDepth=4, maxBins=100): + """ + Train a DecisionTreeModel for regression. + + :param data: Training data: RDD of LabeledPoint. + Labels are real numbers. + :param categoricalFeaturesInfo: Map from categorical feature index + to number of categories. + Any feature not in this map + is treated as continuous. + :param impurity: Supported values: "variance" + :param maxDepth: Max depth of tree. + E.g., depth 0 means 1 leaf node. + Depth 1 means 1 internal node + 2 leaf nodes. + :param maxBins: Number of bins used for finding splits at each node. + :return: DecisionTreeModel + """ + return DecisionTree.train(data, "regression", 0, + categoricalFeaturesInfo, + impurity, maxDepth, maxBins) + + + @staticmethod + def train(data, algo, numClasses, categoricalFeaturesInfo, + impurity, maxDepth, maxBins=100): + """ + Train a DecisionTreeModel for classification or regression. + + :param data: Training data: RDD of LabeledPoint. + For classification, labels are integers + {0,1,...,numClasses}. + For regression, labels are real numbers. + :param algo: "classification" or "regression" + :param numClasses: Number of classes for classification. + :param categoricalFeaturesInfo: Map from categorical feature index + to number of categories. + Any feature not in this map + is treated as continuous. + :param impurity: For classification: "entropy" or "gini". + For regression: "variance". + :param maxDepth: Max depth of tree. + E.g., depth 0 means 1 leaf node. + Depth 1 means 1 internal node + 2 leaf nodes. + :param maxBins: Number of bins used for finding splits at each node. + :return: DecisionTreeModel + """ + sc = data.context + dataBytes = _get_unmangled_labeled_point_rdd(data) + categoricalFeaturesInfoJMap = \ + MapConverter().convert(categoricalFeaturesInfo, + sc._gateway._gateway_client) + model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel( + dataBytes._jrdd, algo, + numClasses, categoricalFeaturesInfoJMap, + impurity, maxDepth, maxBins) + dataBytes.unpersist() + return DecisionTreeModel(sc, model) + + +def _test(): + import doctest + globs = globals().copy() + globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + globs['sc'].stop() + if failure_count: + exit(-1) + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index 5dbab5102e5f8..9ecceaead346f 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -16,6 +16,7 @@ # import numpy as np +import warnings from pyspark.mllib.linalg import Vectors, SparseVector from pyspark.mllib.regression import LabeledPoint @@ -30,9 +31,9 @@ class MLUtils: Helper methods to load, save and pre-process data used in MLlib. 
""" - @deprecated @staticmethod def _parse_libsvm_line(line, multiclass): + warnings.warn("deprecated", DeprecationWarning) return _parse_libsvm_line(line) @staticmethod @@ -68,9 +69,9 @@ def _convert_labeled_point_to_libsvm(p): " but got " % type(v)) return " ".join(items) - @deprecated @staticmethod def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None): + warnings.warn("deprecated", DeprecationWarning) return loadLibSVMFile(sc, path, numFeatures, minPartitions) @staticmethod @@ -107,7 +108,6 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0") >>> tempFile.flush() >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() - >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() >>> tempFile.close() >>> type(examples[0]) == LabeledPoint True @@ -116,21 +116,18 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): >>> type(examples[1]) == LabeledPoint True >>> print examples[1] - (0.0,(6,[],[])) + (-1.0,(6,[],[])) >>> type(examples[2]) == LabeledPoint True >>> print examples[2] - (0.0,(6,[1,3,5],[4.0,5.0,6.0])) - >>> multiclass_examples[1].label - -1.0 + (-1.0,(6,[1,3,5],[4.0,5.0,6.0])) """ lines = sc.textFile(path, minPartitions) parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) if numFeatures <= 0: parsed.cache() - numFeatures = parsed.map( - lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 + numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) @staticmethod diff --git a/python/pyspark/mllib/util.py.orig b/python/pyspark/mllib/util.py.orig new file mode 100644 index 0000000000000..8f053aae42e88 --- /dev/null +++ b/python/pyspark/mllib/util.py.orig @@ -0,0 +1,211 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import warnings + +from pyspark.mllib.linalg import Vectors, SparseVector +from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib._common import _convert_vector, _deserialize_labeled_point +from pyspark.rdd import RDD +from pyspark.serializers import NoOpSerializer + + +class MLUtils: + + """ + Helper methods to load, save and pre-process data used in MLlib. + """ + + @staticmethod + def _parse_libsvm_line(line, multiclass): + warnings.warn("deprecated", DeprecationWarning) + return _parse_libsvm_line(line) + + @staticmethod + def _parse_libsvm_line(line): + """ + Parses a line in LIBSVM format into (label, indices, values). 
+ """ + items = line.split(None) + label = float(items[0]) + nnz = len(items) - 1 + indices = np.zeros(nnz, dtype=np.int32) + values = np.zeros(nnz) + for i in xrange(nnz): + index, value = items[1 + i].split(":") + indices[i] = int(index) - 1 + values[i] = float(value) + return label, indices, values + + @staticmethod + def _convert_labeled_point_to_libsvm(p): + """Converts a LabeledPoint to a string in LIBSVM format.""" + items = [str(p.label)] + v = _convert_vector(p.features) + if type(v) == np.ndarray: + for i in xrange(len(v)): + items.append(str(i + 1) + ":" + str(v[i])) + elif type(v) == SparseVector: + nnz = len(v.indices) + for i in xrange(nnz): + items.append(str(v.indices[i] + 1) + ":" + str(v.values[i])) + else: + raise TypeError("_convert_labeled_point_to_libsvm needs either ndarray or SparseVector" + " but got " % type(v)) + return " ".join(items) + + @staticmethod + def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None): + warnings.warn("deprecated", DeprecationWarning) + return loadLibSVMFile(sc, path, numFeatures, minPartitions) + + @staticmethod + def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): + """ + Loads labeled data in the LIBSVM format into an RDD of + LabeledPoint. The LIBSVM format is a text-based format used by + LIBSVM and LIBLINEAR. Each line represents a labeled sparse + feature vector using the following format: + + label index1:value1 index2:value2 ... + + where the indices are one-based and in ascending order. This + method parses each line into a LabeledPoint, where the feature + indices are converted to zero-based. + + @param sc: Spark context + @param path: file or directory path in any Hadoop-supported file + system URI + @param numFeatures: number of features, which will be determined + from the input data if a nonpositive value + is given. This is useful when the dataset is + already split into multiple files and you + want to load them separately, because some + features may not present in certain files, + which leads to inconsistent feature + dimensions. + @param minPartitions: min number of partitions + @return: labeled data stored as an RDD of LabeledPoint + + >>> from tempfile import NamedTemporaryFile + >>> from pyspark.mllib.util import MLUtils + >>> tempFile = NamedTemporaryFile(delete=True) + >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0") + >>> tempFile.flush() + >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() + >>> tempFile.close() + >>> type(examples[0]) == LabeledPoint + True + >>> print examples[0] + (1.0,(6,[0,2,4],[1.0,2.0,3.0])) + >>> type(examples[1]) == LabeledPoint + True + >>> print examples[1] + (-1.0,(6,[],[])) + >>> type(examples[2]) == LabeledPoint + True + >>> print examples[2] + (-1.0,(6,[1,3,5],[4.0,5.0,6.0])) + """ + + lines = sc.textFile(path, minPartitions) + parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) + if numFeatures <= 0: + parsed.cache() +<<<<<<< HEAD + numFeatures = parsed.map( + lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 +======= + numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 +>>>>>>> 3dc55fdf450b4237f7c592fce56d1467fd206366 + return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) + + @staticmethod + def saveAsLibSVMFile(data, dir): + """ + Save labeled data in LIBSVM format. 
+ + @param data: an RDD of LabeledPoint to be saved + @param dir: directory to save the data + + >>> from tempfile import NamedTemporaryFile + >>> from fileinput import input + >>> from glob import glob + >>> from pyspark.mllib.util import MLUtils + >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \ + LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))] + >>> tempFile = NamedTemporaryFile(delete=True) + >>> tempFile.close() + >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name) + >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*")))) + '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n' + """ + lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p)) + lines.saveAsTextFile(dir) + + @staticmethod + def loadLabeledPoints(sc, path, minPartitions=None): + """ + Load labeled points saved using RDD.saveAsTextFile. + + @param sc: Spark context + @param path: file or directory path in any Hadoop-supported file + system URI + @param minPartitions: min number of partitions + @return: labeled data stored as an RDD of LabeledPoint + + >>> from tempfile import NamedTemporaryFile + >>> from pyspark.mllib.util import MLUtils + >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \ + LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))] + >>> tempFile = NamedTemporaryFile(delete=True) + >>> tempFile.close() + >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name) + >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect() + >>> type(loaded[0]) == LabeledPoint + True + >>> print examples[0] + (1.1,(3,[0,2],[-1.23,4.56e-07])) + >>> type(examples[1]) == LabeledPoint + True + >>> print examples[1] + (0.0,[1.01,2.02,3.03]) + """ + minPartitions = minPartitions or min(sc.defaultParallelism, 2) + jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints( + sc._jsc, path, minPartitions) + serialized = RDD(jSerialized, sc, NoOpSerializer()) + return serialized.map(lambda bytes: _deserialize_labeled_point(bytearray(bytes))) + + +def _test(): + import doctest + from pyspark.context import SparkContext + globs = globals().copy() + # The small batch size here ensures that we see multiple batches, + # even in these small test examples: + globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) + globs['sc'].stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 4a4b5c8a476fb..cabd4fa71222d 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -28,9 +28,13 @@ from operator import itemgetter from pyspark.rdd import RDD, PipelinedRDD -from pyspark.serializers import BatchedSerializer, PickleSerializer +from pyspark.serializers import BatchedSerializer, PickleSerializer, CloudPickleSerializer + +from itertools import chain, ifilter, imap from py4j.protocol import Py4JError +from py4j.java_collections import ListConverter, MapConverter + __all__ = [ "StringType", "BinaryType", "BooleanType", "TimestampType", "DecimalType", @@ -926,7 +930,7 @@ def __init__(self, sparkContext, sqlContext=None): ... b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), ... time=datetime(2014, 8, 1, 14, 1, 5))]) >>> srdd = sqlCtx.inferSchema(allTypes) - >>> srdd.registerAsTable("allTypes") + >>> srdd.registerTempTable("allTypes") >>> sqlCtx.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a ' ... 
'from allTypes where b and i > 0').collect() [Row(c0=2, c1=2.0, c2=False, c3=2, c4=0...8, 1, 14, 1, 5), a=1)] @@ -953,6 +957,39 @@ def _ssql_ctx(self): self._scala_SQLContext = self._jvm.SQLContext(self._jsc.sc()) return self._scala_SQLContext + def registerFunction(self, name, f, returnType=StringType()): + """Registers a lambda function as a UDF so it can be used in SQL statements. + + In addition to a name and the function itself, the return type can be optionally specified. + When the return type is not given it default to a string and conversion will automatically + be done. For any other return type, the produced object must match the specified type. + + >>> sqlCtx.registerFunction("stringLengthString", lambda x: len(x)) + >>> sqlCtx.sql("SELECT stringLengthString('test')").collect() + [Row(c0=u'4')] + >>> sqlCtx.registerFunction("stringLengthInt", lambda x: len(x), IntegerType()) + >>> sqlCtx.sql("SELECT stringLengthInt('test')").collect() + [Row(c0=4)] + >>> sqlCtx.registerFunction("twoArgs", lambda x, y: len(x) + y, IntegerType()) + >>> sqlCtx.sql("SELECT twoArgs('test', 1)").collect() + [Row(c0=5)] + """ + func = lambda _, it: imap(lambda x: f(*x), it) + command = (func, + BatchedSerializer(PickleSerializer(), 1024), + BatchedSerializer(PickleSerializer(), 1024)) + env = MapConverter().convert(self._sc.environment, + self._sc._gateway._gateway_client) + includes = ListConverter().convert(self._sc._python_includes, + self._sc._gateway._gateway_client) + self._ssql_ctx.registerPython(name, + bytearray(CloudPickleSerializer().dumps(command)), + env, + includes, + self._sc.pythonExec, + self._sc._javaAccumulator, + str(returnType)) + def inferSchema(self, rdd): """Infer and apply a schema to an RDD of L{Row}s. @@ -1473,19 +1510,23 @@ def saveAsParquetFile(self, path): """ self._jschema_rdd.saveAsParquetFile(path) - def registerAsTable(self, name): + def registerTempTable(self, name): """Registers this RDD as a temporary table using the given name. The lifetime of this temporary table is tied to the L{SQLContext} that was used to create this SchemaRDD. >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.registerAsTable("test") + >>> srdd.registerTempTable("test") >>> srdd2 = sqlCtx.sql("select * from test") >>> sorted(srdd.collect()) == sorted(srdd2.collect()) True """ - self._jschema_rdd.registerAsTable(name) + self._jschema_rdd.registerTempTable(name) + + def registerAsTable(self, name): + warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning) + self.registerTempTable(name) def insertInto(self, tableName, overwrite=False): """Inserts the contents of this SchemaRDD into the specified table. 
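Together, registerFunction and registerTempTable cover the full Python SQL round trip: define a UDF, expose an RDD as a table, and query both. The following is a minimal usage sketch, not part of the patch itself; the people data and the strLen name are illustrative, and an already-running SparkContext named sc is assumed:

    from pyspark.sql import SQLContext, Row, IntegerType

    sqlCtx = SQLContext(sc)

    # UDF return types default to strings; declare anything else explicitly.
    sqlCtx.registerFunction("strLen", lambda s: len(s), IntegerType())

    # Infer a schema from an RDD of Rows and register it as a temp table.
    people = sqlCtx.inferSchema(sc.parallelize([Row(name="alice"), Row(name="bob")]))
    people.registerTempTable("people")

    # Both the UDF and the table are now visible to SQL.
    sqlCtx.sql("SELECT name, strLen(name) FROM people").collect()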
@@ -1576,9 +1617,9 @@ def persist(self, storageLevel): self._jschema_rdd.persist(javaStorageLevel) return self - def unpersist(self): + def unpersist(self, blocking=True): self.is_cached = False - self._jschema_rdd.unpersist() + self._jschema_rdd.unpersist(blocking) return self def checkpoint(self): diff --git a/python/run-tests b/python/run-tests index 5049e15ce5f8a..48feba2f5bd63 100755 --- a/python/run-tests +++ b/python/run-tests @@ -71,6 +71,7 @@ run_test "pyspark/mllib/random.py" run_test "pyspark/mllib/recommendation.py" run_test "pyspark/mllib/regression.py" run_test "pyspark/mllib/tests.py" +run_test "pyspark/mllib/util.py" if [[ $FAILED == 0 ]]; then echo -en "\033[32m" # Green diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 54fa96baa1e18..58d44e7923bee 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -54,11 +54,6 @@ spark-core_${scala.binary.version} ${project.version} - - com.typesafe - scalalogging-slf4j_${scala.binary.version} - 1.0.1 - org.scalatest scalatest_${scala.binary.version} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 74c0104e5b17f..2ba68cab115fb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -109,12 +109,12 @@ class Analyzer(catalog: Catalog, registry: FunctionRegistry, caseSensitive: Bool object ResolveReferences extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case q: LogicalPlan if q.childrenResolved => - logger.trace(s"Attempting to resolve ${q.simpleString}") + logTrace(s"Attempting to resolve ${q.simpleString}") q transformExpressions { case u @ UnresolvedAttribute(name) => // Leave unchanged if resolution fails. Hopefully will be resolved next round. val result = q.resolve(name).getOrElse(u) - logger.debug(s"Resolving $u to $result") + logDebug(s"Resolving $u to $result") result } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index c0255701b7ba5..760c49fbca4a5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -18,17 +18,49 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.Expression +import scala.collection.mutable /** A catalog for looking up user defined functions, used by an [[Analyzer]]. 
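 * Implementations map each function name to a FunctionBuilder, i.e. a
 * Seq[Expression] => Expression factory that builds the function's expression
 * tree from its child expressions on lookup.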
*/ trait FunctionRegistry { + type FunctionBuilder = Seq[Expression] => Expression + + def registerFunction(name: String, builder: FunctionBuilder): Unit + def lookupFunction(name: String, children: Seq[Expression]): Expression } +trait OverrideFunctionRegistry extends FunctionRegistry { + + val functionBuilders = new mutable.HashMap[String, FunctionBuilder]() + + def registerFunction(name: String, builder: FunctionBuilder) = { + functionBuilders.put(name, builder) + } + + abstract override def lookupFunction(name: String, children: Seq[Expression]): Expression = { + functionBuilders.get(name).map(_(children)).getOrElse(super.lookupFunction(name,children)) + } +} + +class SimpleFunctionRegistry extends FunctionRegistry { + val functionBuilders = new mutable.HashMap[String, FunctionBuilder]() + + def registerFunction(name: String, builder: FunctionBuilder) = { + functionBuilders.put(name, builder) + } + + override def lookupFunction(name: String, children: Seq[Expression]): Expression = { + functionBuilders(name)(children) + } +} + /** * A trivial catalog that returns an error when a function is requested. Used for testing when all * functions are already filled in and the analyser needs only to resolve attribute references. */ object EmptyFunctionRegistry extends FunctionRegistry { + def registerFunction(name: String, builder: FunctionBuilder) = ??? + def lookupFunction(name: String, children: Seq[Expression]): Expression = { throw new UnsupportedOperationException } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 47c7ad076ad07..e94f2a3bea63e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -75,7 +75,7 @@ trait HiveTypeCoercion { // Leave the same if the dataTypes match. 
case Some(newType) if a.dataType == newType.dataType => a case Some(newType) => - logger.debug(s"Promoting $a to $newType in ${q.simpleString}}") + logDebug(s"Promoting $a to $newType in ${q.simpleString}}") newType } } @@ -154,7 +154,7 @@ trait HiveTypeCoercion { (Alias(Cast(l, StringType), l.name)(), r) case (l, r) if l.dataType != r.dataType => - logger.debug(s"Resolving mismatched union input ${l.dataType}, ${r.dataType}") + logDebug(s"Resolving mismatched union input ${l.dataType}, ${r.dataType}") findTightestCommonType(l.dataType, r.dataType).map { widestType => val newLeft = if (l.dataType == widestType) l else Alias(Cast(l, widestType), l.name)() @@ -170,7 +170,7 @@ trait HiveTypeCoercion { val newLeft = if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) { - logger.debug(s"Widening numeric types in union $castedLeft ${left.output}") + logDebug(s"Widening numeric types in union $castedLeft ${left.output}") Project(castedLeft, left) } else { left @@ -178,7 +178,7 @@ trait HiveTypeCoercion { val newRight = if (castedRight.map(_.dataType) != right.output.map(_.dataType)) { - logger.debug(s"Widening numeric types in union $castedRight ${right.output}") + logDebug(s"Widening numeric types in union $castedRight ${right.output}") Project(castedRight, right) } else { right diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala index f38f99569f207..0913f15888780 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.catalyst.trees diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala index acddf5e9c7004..95633dd0c9870 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUdf.scala @@ -27,6 +27,22 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi def references = children.flatMap(_.references).toSet def nullable = true + /** This method has been generated by this script + + (1 to 22).map { x => + val anys = (1 to x).map(x => "Any").reduce(_ + ", " + _) + val evals = (0 to x - 1).map(x => s"children($x).eval(input)").reduce(_ + ",\n " + _) + + s""" + case $x => + function.asInstanceOf[($anys) => Any]( + $evals) + """ + } + + */ + + // scalastyle:off override def eval(input: Row): Any = { children.size match { case 0 => function.asInstanceOf[() => Any]() @@ -35,6 +51,297 @@ case class ScalaUdf(function: AnyRef, dataType: DataType, children: Seq[Expressi function.asInstanceOf[(Any, Any) => Any]( children(0).eval(input), children(1).eval(input)) + case 3 => + function.asInstanceOf[(Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input)) + case 4 => + function.asInstanceOf[(Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + 
children(3).eval(input)) + case 5 => + function.asInstanceOf[(Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input)) + case 6 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input)) + case 7 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input)) + case 8 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input)) + case 9 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input)) + case 10 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input)) + case 11 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input)) + case 12 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input)) + case 13 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input)) + case 14 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input)) + case 15 => + 
function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input), + children(14).eval(input)) + case 16 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input), + children(14).eval(input), + children(15).eval(input)) + case 17 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input), + children(14).eval(input), + children(15).eval(input), + children(16).eval(input)) + case 18 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input), + children(14).eval(input), + children(15).eval(input), + children(16).eval(input), + children(17).eval(input)) + case 19 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input), + children(14).eval(input), + children(15).eval(input), + children(16).eval(input), + children(17).eval(input), + children(18).eval(input)) + case 20 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input), + children(14).eval(input), + 
children(15).eval(input), + children(16).eval(input), + children(17).eval(input), + children(18).eval(input), + children(19).eval(input)) + case 21 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input), + children(14).eval(input), + children(15).eval(input), + children(16).eval(input), + children(17).eval(input), + children(18).eval(input), + children(19).eval(input), + children(20).eval(input)) + case 22 => + function.asInstanceOf[(Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any) => Any]( + children(0).eval(input), + children(1).eval(input), + children(2).eval(input), + children(3).eval(input), + children(4).eval(input), + children(5).eval(input), + children(6).eval(input), + children(7).eval(input), + children(8).eval(input), + children(9).eval(input), + children(10).eval(input), + children(11).eval(input), + children(12).eval(input), + children(13).eval(input), + children(14).eval(input), + children(15).eval(input), + children(16).eval(input), + children(17).eval(input), + children(18).eval(input), + children(19).eval(input), + children(20).eval(input), + children(21).eval(input)) } + // scalastyle:on } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala index 4211998f7511a..094ff14552283 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions.codegen -import com.typesafe.scalalogging.slf4j.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.types.{StringType, NumericType} @@ -92,7 +92,7 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] wit } new $orderingName() """ - logger.debug(s"Generated Ordering: $code") + logDebug(s"Generated Ordering: $code") toolBox.eval(code).asInstanceOf[Ordering[Row]] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/package.scala index ca9642954eb27..bdd07bbeb2230 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/package.scala @@ -25,5 +25,4 @@ package object catalyst { */ protected[catalyst] object ScalaReflectionLock - protected[catalyst] type Logging = com.typesafe.scalalogging.slf4j.Logging } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala index 781ba489b44c6..5839c9f7c43ef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.planning -import org.apache.spark.sql.catalyst.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.TreeNode diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index bc763a4e06e67..90923fe31a063 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.planning import scala.annotation.tailrec import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -184,7 +184,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper { def unapply(plan: LogicalPlan): Option[ReturnType] = plan match { case join @ Join(left, right, joinType, condition) => - logger.debug(s"Considering join on: $condition") + logDebug(s"Considering join on: $condition") // Find equi-join predicates that can be evaluated before the join, and thus can be used // as join keys. val (joinPredicates, otherPredicates) = @@ -202,7 +202,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper { val rightKeys = joinKeys.map(_._2) if (joinKeys.nonEmpty) { - logger.debug(s"leftKeys:${leftKeys} | rightKeys:${rightKeys}") + logDebug(s"leftKeys:${leftKeys} | rightKeys:${rightKeys}") Some((joinType, leftKeys, rightKeys, otherPredicates.reduceOption(And), left, right)) } else { None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala index f8960b3fe7a17..03414b2301e81 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.rules -import org.apache.spark.sql.catalyst.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.trees.TreeNode abstract class Rule[TreeType <: TreeNode[_]] extends Logging { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index 6aa407c836aec..d192b151ac1c3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.rules -import org.apache.spark.sql.catalyst.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.sideBySide @@ -60,7 +60,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { case (plan, rule) => val result = rule(plan) if (!result.fastEquals(plan)) { - logger.trace( + logTrace( s""" |=== Applying Rule ${rule.ruleName} === |${sideBySide(plan.treeString, result.treeString).mkString("\n")} @@ -73,26 +73,26 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends 
Logging { if (iteration > batch.strategy.maxIterations) { // Only log if this is a rule that is supposed to run more than once. if (iteration != 2) { - logger.info(s"Max iterations (${iteration - 1}) reached for batch ${batch.name}") + logInfo(s"Max iterations (${iteration - 1}) reached for batch ${batch.name}") } continue = false } if (curPlan.fastEquals(lastPlan)) { - logger.trace(s"Fixed point reached for batch ${batch.name} after $iteration iterations.") + logTrace(s"Fixed point reached for batch ${batch.name} after $iteration iterations.") continue = false } lastPlan = curPlan } if (!batchStartPlan.fastEquals(curPlan)) { - logger.debug( + logDebug( s""" |=== Result of Batch ${batch.name} === |${sideBySide(plan.treeString, curPlan.treeString).mkString("\n")} """.stripMargin) } else { - logger.trace(s"Batch ${batch.name} has no effect.") + logTrace(s"Batch ${batch.name} has no effect.") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/package.scala index 9a28d035a10a3..d725a92c06f7b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/package.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst +import org.apache.spark.Logging + /** * A library for easily manipulating trees of operators. Operators that extend TreeNode are * granted the following interface: @@ -31,8 +33,8 @@ package org.apache.spark.sql.catalyst *
  • debugging support - pretty printing, easy splicing of trees, etc.
  • * */ -package object trees { +package object trees extends Logging { // Since we want tree nodes to be lightweight, we create one logger for all treenode instances. - protected val logger = - com.typesafe.scalalogging.slf4j.Logger(org.slf4j.LoggerFactory.getLogger("catalyst.trees")) + protected override def logName = "catalyst.trees" + } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF1.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF1.java new file mode 100644 index 0000000000000..ef959e35e1027 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF1.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +// ************************************************** +// THIS FILE IS AUTOGENERATED BY CODE IN +// org.apache.spark.sql.api.java.FunctionRegistration +// ************************************************** + +/** + * A Spark SQL UDF that has 1 arguments. + */ +public interface UDF1 extends Serializable { + public R call(T1 t1) throws Exception; +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF10.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF10.java new file mode 100644 index 0000000000000..96ab3a96c3d5e --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF10.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +// ************************************************** +// THIS FILE IS AUTOGENERATED BY CODE IN +// org.apache.spark.sql.api.java.FunctionRegistration +// ************************************************** + +/** + * A Spark SQL UDF that has 10 arguments. 
+ */ +public interface UDF10 extends Serializable { + public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10) throws Exception; +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF11.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF11.java new file mode 100644 index 0000000000000..58ae8edd6d817 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF11.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +// ************************************************** +// THIS FILE IS AUTOGENERATED BY CODE IN +// org.apache.spark.sql.api.java.FunctionRegistration +// ************************************************** + +/** + * A Spark SQL UDF that has 11 arguments. + */ +public interface UDF11 extends Serializable { + public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11) throws Exception; +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF12.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF12.java new file mode 100644 index 0000000000000..d9da0f6eddd94 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF12.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +// ************************************************** +// THIS FILE IS AUTOGENERATED BY CODE IN +// org.apache.spark.sql.api.java.FunctionRegistration +// ************************************************** + +/** + * A Spark SQL UDF that has 12 arguments. 
+ */ +public interface UDF12 extends Serializable { + public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12) throws Exception; +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF13.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF13.java new file mode 100644 index 0000000000000..095fc1a8076b5 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF13.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +// ************************************************** +// THIS FILE IS AUTOGENERATED BY CODE IN +// org.apache.spark.sql.api.java.FunctionRegistration +// ************************************************** + +/** + * A Spark SQL UDF that has 13 arguments. + */ +public interface UDF13 extends Serializable { + public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13) throws Exception; +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF14.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF14.java new file mode 100644 index 0000000000000..eb27eaa180086 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF14.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +// ************************************************** +// THIS FILE IS AUTOGENERATED BY CODE IN +// org.apache.spark.sql.api.java.FunctionRegistration +// ************************************************** + +/** + * A Spark SQL UDF that has 14 arguments. 
+ */ +public interface UDF14 extends Serializable { + public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14) throws Exception; +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF15.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF15.java new file mode 100644 index 0000000000000..1fbcff56332b6 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF15.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +// ************************************************** +// THIS FILE IS AUTOGENERATED BY CODE IN +// org.apache.spark.sql.api.java.FunctionRegistration +// ************************************************** + +/** + * A Spark SQL UDF that has 15 arguments. + */ +public interface UDF15 extends Serializable { + public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15) throws Exception; +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF16.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF16.java new file mode 100644 index 0000000000000..1133561787a69 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF16.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +// ************************************************** +// THIS FILE IS AUTOGENERATED BY CODE IN +// org.apache.spark.sql.api.java.FunctionRegistration +// ************************************************** + +/** + * A Spark SQL UDF that has 16 arguments. 
+ */
+public interface UDF16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF17.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF17.java
new file mode 100644
index 0000000000000..dfae7922c9b63
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF17.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 17 arguments.
+ */
+public interface UDF17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF18.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF18.java
new file mode 100644
index 0000000000000..e9d1c6d52d4ea
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF18.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 18 arguments.
+ */
+public interface UDF18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF19.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF19.java
new file mode 100644
index 0000000000000..46b9d2d3c9457
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF19.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 19 arguments.
+ */
+public interface UDF19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF2.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF2.java
new file mode 100644
index 0000000000000..cd3fde8da419e
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF2.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 2 arguments.
+ */
+public interface UDF2<T1, T2, R> extends Serializable {
+  public R call(T1 t1, T2 t2) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF20.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF20.java
new file mode 100644
index 0000000000000..113d3d26be4a7
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF20.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 20 arguments.
+ */
+public interface UDF20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF21.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF21.java
new file mode 100644
index 0000000000000..74118f2cf8da7
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF21.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 21 arguments.
+ */
+public interface UDF21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20, T21 t21) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF22.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF22.java
new file mode 100644
index 0000000000000..0e7cc40be45ec
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF22.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 22 arguments.
+ */
+public interface UDF22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9, T10 t10, T11 t11, T12 t12, T13 t13, T14 t14, T15 t15, T16 t16, T17 t17, T18 t18, T19 t19, T20 t20, T21 t21, T22 t22) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF3.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF3.java
new file mode 100644
index 0000000000000..6a880f16be47a
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF3.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 3 arguments.
+ */
+public interface UDF3<T1, T2, T3, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF4.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF4.java
new file mode 100644
index 0000000000000..fcad2febb18e6
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF4.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 4 arguments.
+ */
+public interface UDF4<T1, T2, T3, T4, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF5.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF5.java
new file mode 100644
index 0000000000000..ce0cef43a2144
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF5.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 5 arguments.
+ */
+public interface UDF5<T1, T2, T3, T4, T5, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF6.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF6.java
new file mode 100644
index 0000000000000..f56b806684e61
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF6.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 6 arguments.
+ */
+public interface UDF6<T1, T2, T3, T4, T5, T6, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF7.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF7.java
new file mode 100644
index 0000000000000..25bd6d3241bd4
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF7.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 7 arguments.
+ */
+public interface UDF7<T1, T2, T3, T4, T5, T6, T7, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF8.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF8.java
new file mode 100644
index 0000000000000..a3b7ac5f94ce7
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF8.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 8 arguments.
+ */
+public interface UDF8<T1, T2, T3, T4, T5, T6, T7, T8, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8) throws Exception;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF9.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF9.java
new file mode 100644
index 0000000000000..205e72a1522fc
--- /dev/null
+++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/UDF9.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.java;
+
+import java.io.Serializable;
+
+// **************************************************
+// THIS FILE IS AUTOGENERATED BY CODE IN
+// org.apache.spark.sql.api.java.FunctionRegistration
+// **************************************************
+
+/**
+ * A Spark SQL UDF that has 9 arguments.
+ */
+public interface UDF9<T1, T2, T3, T4, T5, T6, T7, T8, T9, R> extends Serializable {
+  public R call(T1 t1, T2 t2, T3 t3, T4 t4, T5 t5, T6 t6, T7 t7, T8 t8, T9 t9) throws Exception;
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index dad71079c29b9..567f4dca991b2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -36,7 +36,7 @@ import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.SparkStrategies
 import org.apache.spark.sql.json._
 import org.apache.spark.sql.parquet.ParquetRelation
-import org.apache.spark.SparkContext
+import org.apache.spark.{Logging, SparkContext}
 
 /**
  * :: AlphaComponent ::
@@ -48,18 +48,23 @@ import org.apache.spark.SparkContext
  */
@AlphaComponent
 class SQLContext(@transient val sparkContext: SparkContext)
-  extends Logging
+  extends org.apache.spark.Logging
   with SQLConf
   with ExpressionConversions
+  with UDFRegistration
   with Serializable {
 
   self =>
 
   @transient
   protected[sql] lazy val catalog: Catalog = new SimpleCatalog(true)
+
+  @transient
+  protected[sql] lazy val functionRegistry: FunctionRegistry = new SimpleFunctionRegistry
+
   @transient
   protected[sql] lazy val analyzer: Analyzer =
-    new Analyzer(catalog, EmptyFunctionRegistry, caseSensitive = true)
+    new Analyzer(catalog, functionRegistry, caseSensitive = true)
   @transient
   protected[sql] val optimizer = Optimizer
   @transient
@@ -111,7 +116,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
   *  // |-- name: string (nullable = false)
   *  // |-- age: integer (nullable = true)
   *
-  *  peopleSchemaRDD.registerAsTable("people")
+  *  peopleSchemaRDD.registerTempTable("people")
   *  sqlContext.sql("select name from people").collect.foreach(println)
   * }}}
   *
@@ -207,7 +212,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
   *   import sqlContext._
   *
   *   case class Person(name: String, age: Int)
-  *   createParquetFile[Person]("path/to/file.parquet").registerAsTable("people")
+  *   createParquetFile[Person]("path/to/file.parquet").registerTempTable("people")
   *   sql("INSERT INTO people SELECT 'michael', 29")
   * }}}
   *
@@ -379,7 +384,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
   protected abstract class QueryExecution {
     def logical: LogicalPlan
 
-    lazy val analyzed = analyzer(logical)
+    lazy val analyzed = ExtractPythonUdfs(analyzer(logical))
     lazy val optimizedPlan = optimizer(analyzed)
     // TODO: Don't just pick the first one...
     lazy val sparkPlan = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
index d34f62dc8865e..57df79321b35d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
@@ -67,7 +67,7 @@ import org.apache.spark.api.java.JavaRDD
 *   val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i")))
 *   // Any RDD containing case classes can be registered as a table. The schema of the table is
 *   // automatically inferred using scala reflection.
- *   rdd.registerAsTable("records")
+ *   rdd.registerTempTable("records")
 *
 *   val results: SchemaRDD = sql("SELECT * FROM records")
 * }}}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala
index 6a20def475822..2f3033a5f94f0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala
@@ -83,10 +83,13 @@ private[sql] trait SchemaRDDLike {
    *
    * @group schema
    */
-  def registerAsTable(tableName: String): Unit = {
+  def registerTempTable(tableName: String): Unit = {
     sqlContext.registerRDDAsTable(baseSchemaRDD, tableName)
   }
 
+  @deprecated("Use registerTempTable instead of registerAsTable.", "1.1")
+  def registerAsTable(tableName: String): Unit = registerTempTable(tableName)
+
   /**
    * :: Experimental ::
   * Adds the rows from this RDD to the specified table, optionally overwriting the existing data.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UdfRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UdfRegistration.scala
new file mode 100644
index 0000000000000..0b48e9e659faa
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UdfRegistration.scala
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import java.util.{List => JList, Map => JMap}
+
+import org.apache.spark.Accumulator
+import org.apache.spark.sql.catalyst.ScalaReflection
+import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUdf}
+import org.apache.spark.sql.execution.PythonUDF
+
+import scala.reflect.runtime.universe.{TypeTag, typeTag}
+
+/**
+ * Functions for registering Scala lambda functions as UDFs in a SQLContext.
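+ *
+ * A minimal usage sketch (illustrative only; the UDF name is hypothetical and `sqlContext`
+ * is assumed to be an instance of this SQLContext):
+ * {{{
+ *   // Register a Scala closure under a name callable from SQL text.
+ *   sqlContext.registerFunction("strLen", (s: String) => s.length)
+ *   sqlContext.sql("SELECT strLen('test')").collect()
+ * }}}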
+ */ +protected[sql] trait UDFRegistration { + self: SQLContext => + + private[spark] def registerPython( + name: String, + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + pythonExec: String, + accumulator: Accumulator[JList[Array[Byte]]], + stringDataType: String): Unit = { + log.debug( + s""" + | Registering new PythonUDF: + | name: $name + | command: ${command.toSeq} + | envVars: $envVars + | pythonIncludes: $pythonIncludes + | pythonExec: $pythonExec + | dataType: $stringDataType + """.stripMargin) + + + val dataType = parseDataType(stringDataType) + + def builder(e: Seq[Expression]) = + PythonUDF( + name, + command, + envVars, + pythonIncludes, + pythonExec, + accumulator, + dataType, + e) + + functionRegistry.registerFunction(name, builder) + } + + /** registerFunction 1-22 were generated by this script + + (1 to 22).map { x => + val types = (1 to x).map(x => "_").reduce(_ + ", " + _) + s""" + def registerFunction[T: TypeTag](name: String, func: Function$x[$types, T]): Unit = { + def builder(e: Seq[Expression]) = + ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + """ + } + */ + + // scalastyle:off + def registerFunction[T: TypeTag](name: String, func: Function1[_, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function2[_, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function3[_, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function4[_, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function5[_, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function6[_, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function7[_, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function8[_, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function9[_, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: 
String, func: Function10[_, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function11[_, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function12[_, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function13[_, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e) + functionRegistry.registerFunction(name, builder) + } + + def registerFunction[T: TypeTag](name: String, func: Function22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): Unit = { + def builder(e: 
Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e)
+    functionRegistry.registerFunction(name, builder)
+  }
+  // scalastyle:on
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
index 809dd038f94aa..dbaa16e8b0c68 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
@@ -28,14 +28,13 @@ import org.apache.spark.sql.{SQLContext, StructType => SStructType}
 import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericRow, Row => ScalaRow}
 import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.execution.{ExistingRdd, SparkLogicalPlan}
-import org.apache.spark.sql.types.util.DataTypeConversions
-import DataTypeConversions.asScalaDataType;
+import org.apache.spark.sql.types.util.DataTypeConversions.asScalaDataType
 import org.apache.spark.util.Utils
 
 /**
  * The entry point for executing Spark SQL queries from a Java program.
  */
-class JavaSQLContext(val sqlContext: SQLContext) {
+class JavaSQLContext(val sqlContext: SQLContext) extends UDFRegistration {
 
   def this(sparkContext: JavaSparkContext) = this(new SQLContext(sparkContext.sc))
 
@@ -53,7 +52,7 @@ class JavaSQLContext(val sqlContext: SQLContext) {
    * {{{
    *   JavaSQLContext sqlCtx = new JavaSQLContext(...)
    *
-   *   sqlCtx.createParquetFile(Person.class, "path/to/file.parquet").registerAsTable("people")
+   *   sqlCtx.createParquetFile(Person.class, "path/to/file.parquet").registerTempTable("people")
    *   sqlCtx.sql("INSERT INTO people SELECT 'michael', 29")
    * }}}
    *
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/java/UDFRegistration.scala
new file mode 100644
index 0000000000000..158f26e3d445f
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/UDFRegistration.scala
@@ -0,0 +1,252 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql.api.java
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUdf}
+import org.apache.spark.sql.types.util.DataTypeConversions._
+
+/**
+ * A collection of functions that allow Java users to register UDFs. In order to handle functions
+ * of varying arities with minimal boilerplate for our users, we generate classes and functions
+ * for each arity up to 22. The code for this generation can be found in comments in this trait.
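+ *
+ * A minimal usage sketch (illustrative only, written in Scala syntax against the Java API;
+ * the UDF name is hypothetical and `sqlCtx` is assumed to be a JavaSQLContext):
+ * {{{
+ *   // Implement the generated UDF1 interface and declare the SQL return type explicitly.
+ *   sqlCtx.registerFunction("strLen", new UDF1[String, Integer] {
+ *     override def call(s: String): Integer = s.length
+ *   }, DataType.IntegerType)
+ * }}}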
+ */ +private[java] trait UDFRegistration { + self: JavaSQLContext => + + /* The following functions and required interfaces are generated with these code fragments: + + (1 to 22).foreach { i => + val extTypeArgs = (1 to i).map(_ => "_").mkString(", ") + val anyTypeArgs = (1 to i).map(_ => "Any").mkString(", ") + val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs, Any]]" + val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") + println(s""" + |def registerFunction( + | name: String, f: UDF$i[$extTypeArgs, _], @transient dataType: DataType) = { + | val scalaType = asScalaDataType(dataType) + | sqlContext.functionRegistry.registerFunction( + | name, + | (e: Seq[Expression]) => ScalaUdf(f$anyCast.call($anyParams), scalaType, e)) + |} + """.stripMargin) + } + + import java.io.File + import org.apache.spark.sql.catalyst.util.stringToFile + val directory = new File("sql/core/src/main/java/org/apache/spark/sql/api/java/") + (1 to 22).foreach { i => + val typeArgs = (1 to i).map(i => s"T$i").mkString(", ") + val args = (1 to i).map(i => s"T$i t$i").mkString(", ") + + val contents = + s"""/* + | * Licensed to the Apache Software Foundation (ASF) under one or more + | * contributor license agreements. See the NOTICE file distributed with + | * this work for additional information regarding copyright ownership. + | * The ASF licenses this file to You under the Apache License, Version 2.0 + | * (the "License"); you may not use this file except in compliance with + | * the License. You may obtain a copy of the License at + | * + | * http://www.apache.org/licenses/LICENSE-2.0 + | * + | * Unless required by applicable law or agreed to in writing, software + | * distributed under the License is distributed on an "AS IS" BASIS, + | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + | * See the License for the specific language governing permissions and + | * limitations under the License. + | */ + | + |package org.apache.spark.sql.api.java; + | + |import java.io.Serializable; + | + |// ************************************************** + |// THIS FILE IS AUTOGENERATED BY CODE IN + |// org.apache.spark.sql.api.java.FunctionRegistration + |// ************************************************** + | + |/** + | * A Spark SQL UDF that has $i arguments. 
+ | */ + |public interface UDF$i<$typeArgs, R> extends Serializable { + | public R call($args) throws Exception; + |} + |""".stripMargin + + stringToFile(new File(directory, s"UDF$i.java"), contents) + } + + */ + + // scalastyle:off + def registerFunction(name: String, f: UDF1[_, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF1[Any, Any]].call(_: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF2[_, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF3[_, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF4[_, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF5[_, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF6[_, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF7[_, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => 
ScalaUdf(f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: 
Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + def registerFunction(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], dataType: DataType) = { + val scalaType = asScalaDataType(dataType) + sqlContext.functionRegistry.registerFunction( + name, + (e: Seq[Expression]) => ScalaUdf(f.asInstanceOf[UDF22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any), scalaType, e)) + } + + // scalastyle:on +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressibleColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressibleColumnBuilder.scala index 4c6675c3c87bf..6ad12a0dcb64d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressibleColumnBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressibleColumnBuilder.scala @@ -19,7 +19,8 @@ package org.apache.spark.sql.columnar.compression import java.nio.{ByteBuffer, 
ByteOrder} -import org.apache.spark.sql.{Logging, Row} +import org.apache.spark.Logging +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.types.NativeType import org.apache.spark.sql.columnar.{ColumnBuilder, NativeColumnBuilder} @@ -101,7 +102,7 @@ private[sql] trait CompressibleColumnBuilder[T <: NativeType] copyColumnHeader(rawBuffer, compressedBuffer) - logger.info(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") + logInfo(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(rawBuffer, compressedBuffer, columnType) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala index 30712f03cab4c..77dc2ad733215 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala @@ -101,7 +101,7 @@ private[sql] case class AddExchange(sqlContext: SQLContext) extends Rule[SparkPl !operator.requiredChildDistribution.zip(operator.children).map { case (required, child) => val valid = child.outputPartitioning.satisfies(required) - logger.debug( + logDebug( s"${if (valid) "Valid" else "Invalid"} distribution," + s"required: $required current: ${child.outputPartitioning}") valid diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 8bec015c7b465..f0c958fdb537f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -286,6 +286,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { execution.ExistingRdd(Nil, singleRowRdd) :: Nil case logical.Repartition(expressions, child) => execution.Exchange(HashPartitioning(expressions, numPartitions), planLater(child)) :: Nil + case e @ EvaluatePython(udf, child) => + BatchPythonEvaluation(udf, e.output, planLater(child)) :: Nil case SparkLogicalPlan(existingPlan) => existingPlan :: Nil case _ => Nil } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins.scala index cc138c749949d..51bb61530744c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins.scala @@ -405,8 +405,7 @@ case class BroadcastHashJoin( left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { - - override def outputPartitioning: Partitioning = left.outputPartitioning + override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala new file mode 100644 index 0000000000000..b92091b560b1c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala @@ -0,0 +1,177 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. 
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql.execution
+
+import java.util.{List => JList, Map => JMap}
+
+import net.razorvine.pickle.{Pickler, Unpickler}
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.api.python.PythonRDD
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.types._
+import org.apache.spark.{Accumulator, Logging => SparkLogging}
+
+import scala.collection.JavaConversions._
+
+/**
+ * A serialized version of a Python lambda function. Suitable for use in a [[PythonRDD]].
+ */
+private[spark] case class PythonUDF(
+    name: String,
+    command: Array[Byte],
+    envVars: JMap[String, String],
+    pythonIncludes: JList[String],
+    pythonExec: String,
+    accumulator: Accumulator[JList[Array[Byte]]],
+    dataType: DataType,
+    children: Seq[Expression]) extends Expression with SparkLogging {
+
+  override def toString = s"PythonUDF#$name(${children.mkString(",")})"
+
+  def nullable: Boolean = true
+  def references: Set[Attribute] = children.flatMap(_.references).toSet
+
+  override def eval(input: Row) = sys.error("PythonUDFs cannot be directly evaluated.")
+}
+
+/**
+ * Extracts PythonUDFs from operators, rewriting the query plan so that the UDF can be evaluated
+ * alone in a batch.
+ *
+ * This has the limitation that the input to the Python UDF is not allowed to include attributes
+ * from multiple child operators.
+ */
+private[spark] object ExtractPythonUdfs extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan) = plan transform {
+    // Skip EvaluatePython nodes.
+    case p: EvaluatePython => p
+
+    case l: LogicalPlan =>
+      // Extract any PythonUDFs from the current operator.
+      val udfs = l.expressions.flatMap(_.collect { case udf: PythonUDF => udf})
+      if (udfs.isEmpty) {
+        // If there aren't any, we are done.
+        l
+      } else {
+        // Pick the UDF we are going to evaluate (TODO: Support evaluating multiple UDFs at a time)
+        // If there is more than one, we will add another evaluation operator in a subsequent pass.
+        val udf = udfs.head
+
+        var evaluation: EvaluatePython = null
+
+        // Rewrite the child that has the input required for the UDF
+        val newChildren = l.children.map { child =>
+          // Check to make sure that the UDF can be evaluated with only the input of this child.
+          // Other cases are disallowed as they are ambiguous or would require a Cartesian product.
+          if (udf.references.subsetOf(child.outputSet)) {
+            evaluation = EvaluatePython(udf, child)
+            evaluation
+          } else if (udf.references.intersect(child.outputSet).nonEmpty) {
+            sys.error(s"Invalid PythonUDF $udf, requires attributes from more than one child.")
+          } else {
+            child
+          }
+        }
+
+        assert(evaluation != null, "Unable to evaluate PythonUDF. Missing input attributes.")
+
+        // Trim away the new UDF value if it was only used for filtering or something.
+        logical.Project(
+          l.output,
+          l.transformExpressions {
+            case p: PythonUDF if p.id == udf.id => evaluation.resultAttribute
+          }.withNewChildren(newChildren))
+      }
+  }
+}
+
+/**
+ * :: DeveloperApi ::
+ * Evaluates a [[PythonUDF]], appending the result to the end of the input tuple.
+ */
+@DeveloperApi
+case class EvaluatePython(udf: PythonUDF, child: LogicalPlan) extends logical.UnaryNode {
+  val resultAttribute = AttributeReference("pythonUDF", udf.dataType, nullable=true)()
+
+  def references = Set.empty
+  def output = child.output :+ resultAttribute
+}
+
+/**
+ * :: DeveloperApi ::
+ * Uses PythonRDD to evaluate a [[PythonUDF]], one partition of tuples at a time. The input
+ * data is cached and zipped with the result of the UDF evaluation.
+ */
+@DeveloperApi
+case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child: SparkPlan)
+  extends SparkPlan {
+  def children = child :: Nil
+
+  def execute() = {
+    // TODO: Clean up after ourselves?
+    val childResults = child.execute().map(_.copy()).cache()
+
+    val parent = childResults.mapPartitions { iter =>
+      val pickle = new Pickler
+      val currentRow = newMutableProjection(udf.children, child.output)()
+      iter.grouped(1000).map { inputRows =>
+        val toBePickled = inputRows.map(currentRow(_).toArray).toArray
+        pickle.dumps(toBePickled)
+      }
+    }
+
+    val pyRDD = new PythonRDD(
+      parent,
+      udf.command,
+      udf.envVars,
+      udf.pythonIncludes,
+      false,
+      udf.pythonExec,
+      Seq[Broadcast[Array[Byte]]](),
+      udf.accumulator
+    ).mapPartitions { iter =>
+      val pickle = new Unpickler
+      iter.flatMap { pickledResult =>
+        val unpickledBatch = pickle.loads(pickledResult)
+        unpickledBatch.asInstanceOf[java.util.ArrayList[Any]]
+      }
+    }.mapPartitions { iter =>
+      val row = new GenericMutableRow(1)
+      iter.map { result =>
+        row(0) = udf.dataType match {
+          case StringType => result.toString
+          case other => result
+        }
+        row: Row
+      }
+    }
+
+    childResults.zip(pyRDD).mapPartitions { iter =>
+      val joinedRow = new JoinedRow()
+      iter.map {
+        case (row, udfResult) =>
+          joinedRow(row, udfResult)
+      }
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index 70db1ebd3a3e1..a3d2a1c7a51f8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.types._
 import org.apache.spark.sql.catalyst.ScalaReflection
-import org.apache.spark.sql.Logging
+import org.apache.spark.Logging
 
 private[sql] object JsonRDD extends Logging {
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/package.scala
index 0995a4eb6299f..f513eae9c2d13 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/package.scala
@@ -32,8 +32,6 @@ import org.apache.spark.annotation.DeveloperApi
  */
 package object sql {
 
-  protected[sql] type Logging = com.typesafe.scalalogging.slf4j.Logging
-
   /**
    * :: DeveloperApi ::
    *
diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java
new file mode 100644
index 
0000000000000..a9a11285def54 --- /dev/null +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import java.io.Serializable; + +import org.apache.spark.sql.api.java.UDF1; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runners.Suite; +import org.junit.runner.RunWith; + +import org.apache.spark.api.java.JavaSparkContext; + +// The test suite itself is Serializable so that anonymous Function implementations can be +// serialized, as an alternative to converting these anonymous classes to static inner classes; +// see http://stackoverflow.com/questions/758570/. +public class JavaAPISuite implements Serializable { + private transient JavaSparkContext sc; + private transient JavaSQLContext sqlContext; + + @Before + public void setUp() { + sc = new JavaSparkContext("local", "JavaAPISuite"); + sqlContext = new JavaSQLContext(sc); + } + + @After + public void tearDown() { + sc.stop(); + sc = null; + } + + @SuppressWarnings("unchecked") + @Test + public void udf1Test() { + // With Java 8 lambdas: + // sqlContext.registerFunction( + // "stringLengthTest", (String str) -> str.length(), DataType.IntegerType); + + sqlContext.registerFunction("stringLengthTest", new UDF1() { + @Override + public Integer call(String str) throws Exception { + return str.length(); + } + }, DataType.IntegerType); + + // TODO: Why do we need this cast? + Row result = (Row) sqlContext.sql("SELECT stringLengthTest('test')").first(); + assert(result.getInt(0) == 4); + } + + @SuppressWarnings("unchecked") + @Test + public void udf2Test() { + // With Java 8 lambdas: + // sqlContext.registerFunction( + // "stringLengthTest", + // (String str1, String str2) -> str1.length() + str2.length, + // DataType.IntegerType); + + sqlContext.registerFunction("stringLengthTest", new UDF2() { + @Override + public Integer call(String str1, String str2) throws Exception { + return str1.length() + str2.length(); + } + }, DataType.IntegerType); + + // TODO: Why do we need this cast? 
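For comparison with the Java registration above: on the Scala side the same UDF is a one-liner, as exercised by the new UDFSuite later in this patch. The sketch assumes the registerFunction API this series adds to SQLContext, pulled in through the TestSQLContext implicits used throughout the test diffs:

import org.apache.spark.sql.test.TestSQLContext._

registerFunction("stringLengthTest", (str: String) => str.length)
assert(sql("SELECT stringLengthTest('test')").first().getInt(0) === 4)

No cast is needed here because the Scala API's first() already returns a typed Row.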
+ Row result = (Row) sqlContext.sql("SELECT stringLengthTest('test', 'test2')").first(); + assert(result.getInt(0) == 9); + } +} diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java index 3c92906d82864..33e5020bc636a 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java @@ -98,7 +98,7 @@ public Row call(Person person) throws Exception { StructType schema = DataType.createStructType(fields); JavaSchemaRDD schemaRDD = javaSqlCtx.applySchema(rowRDD, schema); - schemaRDD.registerAsTable("people"); + schemaRDD.registerTempTable("people"); List actual = javaSqlCtx.sql("SELECT * FROM people").collect(); List expected = new ArrayList(2); @@ -149,14 +149,14 @@ public void applySchemaToJSON() { JavaSchemaRDD schemaRDD1 = javaSqlCtx.jsonRDD(jsonRDD); StructType actualSchema1 = schemaRDD1.schema(); Assert.assertEquals(expectedSchema, actualSchema1); - schemaRDD1.registerAsTable("jsonTable1"); + schemaRDD1.registerTempTable("jsonTable1"); List actual1 = javaSqlCtx.sql("select * from jsonTable1").collect(); Assert.assertEquals(expectedResult, actual1); JavaSchemaRDD schemaRDD2 = javaSqlCtx.jsonRDD(jsonRDD, expectedSchema); StructType actualSchema2 = schemaRDD2.schema(); Assert.assertEquals(expectedSchema, actualSchema2); - schemaRDD1.registerAsTable("jsonTable2"); + schemaRDD1.registerTempTable("jsonTable2"); List actual2 = javaSqlCtx.sql("select * from jsonTable2").collect(); Assert.assertEquals(expectedResult, actual2); } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index c3c0dcb1aa00b..fbf9bd9dbcdea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -78,7 +78,7 @@ class CachedTableSuite extends QueryTest { } test("SELECT Star Cached Table") { - TestSQLContext.sql("SELECT * FROM testData").registerAsTable("selectStar") + TestSQLContext.sql("SELECT * FROM testData").registerTempTable("selectStar") TestSQLContext.cacheTable("selectStar") TestSQLContext.sql("SELECT * FROM selectStar WHERE key = 1").collect() TestSQLContext.uncacheTable("selectStar") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/InsertIntoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/InsertIntoSuite.scala index 4f0b85f26254b..c87d762751e6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/InsertIntoSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/InsertIntoSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.io.File +import _root_.java.io.File /* Implicits */ import org.apache.spark.sql.test.TestSQLContext._ @@ -31,7 +31,7 @@ class InsertIntoSuite extends QueryTest { testFilePath.delete() testFilePath.deleteOnExit() val testFile = createParquetFile[TestData](testFilePath.getCanonicalPath) - testFile.registerAsTable("createAndInsertTest") + testFile.registerTempTable("createAndInsertTest") // Add some data. 
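Most of the remaining test churn in this patch is a single mechanical rename: registerAsTable becomes registerTempTable, with unchanged behavior (the old name presumably survives as a deprecated alias, but every call site is migrated here). Assuming any SchemaRDD rdd, the before and after forms are:

rdd.registerAsTable("people")    // old name, removed by these hunks
rdd.registerTempTable("people")  // new name used from here on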
testData.insertInto("createAndInsertTest") @@ -86,7 +86,7 @@ class InsertIntoSuite extends QueryTest { testFilePath.delete() testFilePath.deleteOnExit() val testFile = createParquetFile[TestData](testFilePath.getCanonicalPath) - testFile.registerAsTable("createAndInsertSQLTest") + testFile.registerTempTable("createAndInsertSQLTest") sql("INSERT INTO createAndInsertSQLTest SELECT * FROM testData") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 2fc80588182d9..6c7697ece8c56 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -285,8 +285,8 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("full outer join") { - upperCaseData.where('N <= 4).registerAsTable("left") - upperCaseData.where('N >= 3).registerAsTable("right") + upperCaseData.where('N <= 4).registerTempTable("left") + upperCaseData.where('N >= 3).registerTempTable("right") val left = UnresolvedRelation(None, "left", None) val right = UnresolvedRelation(None, "right", None) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5c571d35d1bb9..9b2a36d33fca7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -461,7 +461,7 @@ class SQLQuerySuite extends QueryTest { } val schemaRDD1 = applySchema(rowRDD1, schema1) - schemaRDD1.registerAsTable("applySchema1") + schemaRDD1.registerTempTable("applySchema1") checkAnswer( sql("SELECT * FROM applySchema1"), (1, "A1", true, null) :: @@ -491,7 +491,7 @@ class SQLQuerySuite extends QueryTest { } val schemaRDD2 = applySchema(rowRDD2, schema2) - schemaRDD2.registerAsTable("applySchema2") + schemaRDD2.registerTempTable("applySchema2") checkAnswer( sql("SELECT * FROM applySchema2"), (Seq(1, true), Map("A1" -> null)) :: @@ -516,7 +516,7 @@ class SQLQuerySuite extends QueryTest { } val schemaRDD3 = applySchema(rowRDD3, schema2) - schemaRDD3.registerAsTable("applySchema3") + schemaRDD3.registerTempTable("applySchema3") checkAnswer( sql("SELECT f1.f11, f2['D4'] FROM applySchema3"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala index f2934da9a031d..5b84c658db942 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala @@ -61,7 +61,7 @@ class ScalaReflectionRelationSuite extends FunSuite { val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, BigDecimal(1), new Timestamp(12345), Seq(1,2,3)) val rdd = sparkContext.parallelize(data :: Nil) - rdd.registerAsTable("reflectData") + rdd.registerTempTable("reflectData") assert(sql("SELECT * FROM reflectData").collect().head === data.productIterator.toSeq) } @@ -69,7 +69,7 @@ class ScalaReflectionRelationSuite extends FunSuite { test("query case class RDD with nulls") { val data = NullReflectData(null, null, null, null, null, null, null) val rdd = sparkContext.parallelize(data :: Nil) - rdd.registerAsTable("reflectNullData") + rdd.registerTempTable("reflectNullData") assert(sql("SELECT * FROM reflectNullData").collect().head === Seq.fill(7)(null)) } @@ -77,7 +77,7 @@ class 
ScalaReflectionRelationSuite extends FunSuite { test("query case class RDD with Nones") { val data = OptionalReflectData(None, None, None, None, None, None, None) val rdd = sparkContext.parallelize(data :: Nil) - rdd.registerAsTable("reflectOptionalData") + rdd.registerTempTable("reflectOptionalData") assert(sql("SELECT * FROM reflectOptionalData").collect().head === Seq.fill(7)(null)) } @@ -85,7 +85,7 @@ class ScalaReflectionRelationSuite extends FunSuite { // Equality is broken for Arrays, so we test that separately. test("query binary data") { val rdd = sparkContext.parallelize(ReflectBinary(Array[Byte](1)) :: Nil) - rdd.registerAsTable("reflectBinary") + rdd.registerTempTable("reflectBinary") val result = sql("SELECT data FROM reflectBinary").collect().head(0).asInstanceOf[Array[Byte]] assert(result.toSeq === Seq[Byte](1)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index 58cee21e8ad4c..c3ec82fb69778 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -17,18 +17,20 @@ package org.apache.spark.sql +import java.sql.Timestamp + import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.test._ /* Implicits */ -import TestSQLContext._ +import org.apache.spark.sql.test.TestSQLContext._ case class TestData(key: Int, value: String) object TestData { val testData: SchemaRDD = TestSQLContext.sparkContext.parallelize( (1 to 100).map(i => TestData(i, i.toString))) - testData.registerAsTable("testData") + testData.registerTempTable("testData") case class LargeAndSmallInts(a: Int, b: Int) val largeAndSmallInts: SchemaRDD = @@ -39,8 +41,8 @@ object TestData { LargeAndSmallInts(2, 2) :: LargeAndSmallInts(2147483646, 1) :: LargeAndSmallInts(3, 2) :: Nil) - largeAndSmallInts.registerAsTable("largeAndSmallInts") - + largeAndSmallInts.registerTempTable("largeAndSmallInts") + case class TestData2(a: Int, b: Int) val testData2: SchemaRDD = TestSQLContext.sparkContext.parallelize( @@ -50,7 +52,7 @@ object TestData { TestData2(2, 2) :: TestData2(3, 1) :: TestData2(3, 2) :: Nil) - testData2.registerAsTable("testData2") + testData2.registerTempTable("testData2") // TODO: There is no way to express null primitives as case classes currently... 
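The TODO above is a Scala limitation rather than a Spark one: Int is a value type and cannot hold null, so a case-class fixture with nullable integer columns has to use the boxed java.lang.Integer, exactly as the NullInts fixture further down does. A minimal sketch:

case class NullInts(a: java.lang.Integer)

val rows = Seq(NullInts(1), NullInts(null))  // compiles; a plain Int field could not be null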
val testData3 = @@ -69,7 +71,7 @@ object TestData { UpperCaseData(4, "D") :: UpperCaseData(5, "E") :: UpperCaseData(6, "F") :: Nil) - upperCaseData.registerAsTable("upperCaseData") + upperCaseData.registerTempTable("upperCaseData") case class LowerCaseData(n: Int, l: String) val lowerCaseData = @@ -78,14 +80,14 @@ object TestData { LowerCaseData(2, "b") :: LowerCaseData(3, "c") :: LowerCaseData(4, "d") :: Nil) - lowerCaseData.registerAsTable("lowerCaseData") + lowerCaseData.registerTempTable("lowerCaseData") case class ArrayData(data: Seq[Int], nestedData: Seq[Seq[Int]]) val arrayData = TestSQLContext.sparkContext.parallelize( ArrayData(Seq(1,2,3), Seq(Seq(1,2,3))) :: ArrayData(Seq(2,3,4), Seq(Seq(2,3,4))) :: Nil) - arrayData.registerAsTable("arrayData") + arrayData.registerTempTable("arrayData") case class MapData(data: Map[Int, String]) val mapData = @@ -95,18 +97,18 @@ object TestData { MapData(Map(1 -> "a3", 2 -> "b3", 3 -> "c3")) :: MapData(Map(1 -> "a4", 2 -> "b4")) :: MapData(Map(1 -> "a5")) :: Nil) - mapData.registerAsTable("mapData") + mapData.registerTempTable("mapData") case class StringData(s: String) val repeatedData = TestSQLContext.sparkContext.parallelize(List.fill(2)(StringData("test"))) - repeatedData.registerAsTable("repeatedData") + repeatedData.registerTempTable("repeatedData") val nullableRepeatedData = TestSQLContext.sparkContext.parallelize( List.fill(2)(StringData(null)) ++ List.fill(2)(StringData("test"))) - nullableRepeatedData.registerAsTable("nullableRepeatedData") + nullableRepeatedData.registerTempTable("nullableRepeatedData") case class NullInts(a: Integer) val nullInts = @@ -116,7 +118,7 @@ object TestData { NullInts(3) :: NullInts(null) :: Nil ) - nullInts.registerAsTable("nullInts") + nullInts.registerTempTable("nullInts") val allNulls = TestSQLContext.sparkContext.parallelize( @@ -124,7 +126,7 @@ object TestData { NullInts(null) :: NullInts(null) :: NullInts(null) :: Nil) - allNulls.registerAsTable("allNulls") + allNulls.registerTempTable("allNulls") case class NullStrings(n: Int, s: String) val nullStrings = @@ -132,10 +134,10 @@ object TestData { NullStrings(1, "abc") :: NullStrings(2, "ABC") :: NullStrings(3, null) :: Nil) - nullStrings.registerAsTable("nullStrings") + nullStrings.registerTempTable("nullStrings") case class TableName(tableName: String) - TestSQLContext.sparkContext.parallelize(TableName("test") :: Nil).registerAsTable("tableName") + TestSQLContext.sparkContext.parallelize(TableName("test") :: Nil).registerTempTable("tableName") val unparsedStrings = TestSQLContext.sparkContext.parallelize( @@ -143,4 +145,10 @@ object TestData { "2, B2, false, null" :: "3, C3, true, null" :: "4, D4, true, 2147483644" :: Nil) + + case class TimestampField(time: Timestamp) + val timestamps = TestSQLContext.sparkContext.parallelize((1 to 3).map { i => + TimestampField(new Timestamp(i)) + }) + timestamps.registerTempTable("timestamps") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala new file mode 100644 index 0000000000000..76aa9b0081d7e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.test._ + +/* Implicits */ +import TestSQLContext._ + +class UDFSuite extends QueryTest { + + test("Simple UDF") { + registerFunction("strLenScala", (_: String).length) + assert(sql("SELECT strLenScala('test')").first().getInt(0) === 4) + } + + test("TwoArgument UDF") { + registerFunction("strLenScala", (_: String).length + (_:Int)) + assert(sql("SELECT strLenScala('test', 1)").first().getInt(0) === 5) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/api/java/JavaSQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/api/java/JavaSQLSuite.scala index 020baf0c7ec6f..203ff847e94cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/api/java/JavaSQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/api/java/JavaSQLSuite.scala @@ -59,7 +59,7 @@ class JavaSQLSuite extends FunSuite { val rdd = javaCtx.parallelize(person :: Nil) val schemaRDD = javaSqlCtx.applySchema(rdd, classOf[PersonBean]) - schemaRDD.registerAsTable("people") + schemaRDD.registerTempTable("people") javaSqlCtx.sql("SELECT * FROM people").collect() } @@ -76,7 +76,7 @@ class JavaSQLSuite extends FunSuite { val rdd = javaCtx.parallelize(bean :: Nil) val schemaRDD = javaSqlCtx.applySchema(rdd, classOf[AllTypesBean]) - schemaRDD.registerAsTable("allTypes") + schemaRDD.registerTempTable("allTypes") assert( javaSqlCtx.sql( @@ -101,7 +101,7 @@ class JavaSQLSuite extends FunSuite { val rdd = javaCtx.parallelize(bean :: Nil) val schemaRDD = javaSqlCtx.applySchema(rdd, classOf[AllTypesBean]) - schemaRDD.registerAsTable("allTypes") + schemaRDD.registerTempTable("allTypes") assert( javaSqlCtx.sql( @@ -127,7 +127,7 @@ class JavaSQLSuite extends FunSuite { var schemaRDD = javaSqlCtx.jsonRDD(rdd) - schemaRDD.registerAsTable("jsonTable1") + schemaRDD.registerTempTable("jsonTable1") assert( javaSqlCtx.sql("select * from jsonTable1").collect.head.row === @@ -144,7 +144,7 @@ class JavaSQLSuite extends FunSuite { rdd.saveAsTextFile(path) schemaRDD = javaSqlCtx.jsonFile(path) - schemaRDD.registerAsTable("jsonTable2") + schemaRDD.registerTempTable("jsonTable2") assert( javaSqlCtx.sql("select * from jsonTable2").collect.head.row === diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala index 829342215e691..75f653f3280bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala @@ -22,7 +22,7 @@ import java.sql.Timestamp import org.scalatest.FunSuite -import org.apache.spark.sql.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.columnar.ColumnarTestUtils._ import org.apache.spark.sql.execution.SparkSqlSerializer @@ -166,7 +166,7 @@ class ColumnTypeSuite extends FunSuite with Logging { 
buffer.rewind() seq.foreach { expected => - logger.info("buffer = " + buffer + ", expected = " + expected) + logInfo("buffer = " + buffer + ", expected = " + expected) val extracted = columnType.extract(buffer) assert( expected === extracted, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala index 86727b93f3659..b561b44ad7ee2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala @@ -73,4 +73,16 @@ class InMemoryColumnarQuerySuite extends QueryTest { sql("SELECT * FROM nullableRepeatedData"), nullableRepeatedData.collect().toSeq) } + + test("SPARK-2729 regression: timestamp data type") { + checkAnswer( + sql("SELECT time FROM timestamps"), + timestamps.collect().toSeq) + + TestSQLContext.cacheTable("timestamps") + + checkAnswer( + sql("SELECT time FROM timestamps"), + timestamps.collect().toSeq) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 9d9cfdd7c92e3..75c0589eb208e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -183,7 +183,7 @@ class JsonSuite extends QueryTest { assert(expectedSchema === jsonSchemaRDD.schema) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -223,7 +223,7 @@ class JsonSuite extends QueryTest { assert(expectedSchema === jsonSchemaRDD.schema) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") // Access elements of a primitive array. checkAnswer( @@ -291,7 +291,7 @@ class JsonSuite extends QueryTest { ignore("Complex field and type inferring (Ignored)") { val jsonSchemaRDD = jsonRDD(complexFieldAndType) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") // Right now, "field1" and "field2" are treated as aliases. We should fix it. checkAnswer( @@ -320,7 +320,7 @@ class JsonSuite extends QueryTest { assert(expectedSchema === jsonSchemaRDD.schema) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -374,7 +374,7 @@ class JsonSuite extends QueryTest { ignore("Type conflict in primitive field values (Ignored)") { val jsonSchemaRDD = jsonRDD(primitiveFieldValueTypeConflict) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") // Right now, the analyzer does not promote strings in a boolean expreesion. // Number and Boolean conflict: resolve the type as boolean in this query. 
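These JsonSuite hunks exercise type-conflict handling during JSON schema inference, which is separate from the analyzer coercion the ignored test above refers to: when records disagree on a field's type, inference widens to a common numeric type where one exists and otherwise falls back to StringType. An illustrative sketch of that resolution, not JsonRDD's actual code:

import org.apache.spark.sql.catalyst.types._

def compatibleType(t1: DataType, t2: DataType): DataType = (t1, t2) match {
  case (a, b) if a == b => a
  case (IntegerType, LongType) | (LongType, IntegerType) => LongType
  case (_: NumericType, DoubleType) | (DoubleType, _: NumericType) => DoubleType
  case _ => StringType  // no common type: represent the field as a string
}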
@@ -445,7 +445,7 @@ class JsonSuite extends QueryTest { assert(expectedSchema === jsonSchemaRDD.schema) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -466,7 +466,7 @@ class JsonSuite extends QueryTest { assert(expectedSchema === jsonSchemaRDD.schema) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -494,7 +494,7 @@ class JsonSuite extends QueryTest { assert(expectedSchema === jsonSchemaRDD.schema) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") } test("Loading a JSON dataset from a text file") { @@ -514,7 +514,7 @@ class JsonSuite extends QueryTest { assert(expectedSchema === jsonSchemaRDD.schema) - jsonSchemaRDD.registerAsTable("jsonTable") + jsonSchemaRDD.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -546,7 +546,7 @@ class JsonSuite extends QueryTest { assert(schema === jsonSchemaRDD1.schema) - jsonSchemaRDD1.registerAsTable("jsonTable1") + jsonSchemaRDD1.registerTempTable("jsonTable1") checkAnswer( sql("select * from jsonTable1"), @@ -563,7 +563,7 @@ class JsonSuite extends QueryTest { assert(schema === jsonSchemaRDD2.schema) - jsonSchemaRDD2.registerAsTable("jsonTable2") + jsonSchemaRDD2.registerTempTable("jsonTable2") checkAnswer( sql("select * from jsonTable2"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 8955455ec98c7..9933575038bd3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -101,9 +101,9 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA ParquetTestData.writeNestedFile3() ParquetTestData.writeNestedFile4() testRDD = parquetFile(ParquetTestData.testDir.toString) - testRDD.registerAsTable("testsource") + testRDD.registerTempTable("testsource") parquetFile(ParquetTestData.testFilterDir.toString) - .registerAsTable("testfiltersource") + .registerTempTable("testfiltersource") } override def afterAll() { @@ -247,7 +247,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA test("Creating case class RDD table") { TestSQLContext.sparkContext.parallelize((1 to 100)) .map(i => TestRDDEntry(i, s"val_$i")) - .registerAsTable("tmp") + .registerTempTable("tmp") val rdd = sql("SELECT * FROM tmp").collect().sortBy(_.getInt(0)) var counter = 1 rdd.foreach { @@ -266,7 +266,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA .map(i => TestRDDEntry(i, s"val_$i")) rdd.saveAsParquetFile(path) val readFile = parquetFile(path) - readFile.registerAsTable("tmpx") + readFile.registerTempTable("tmpx") val rdd_copy = sql("SELECT * FROM tmpx").collect() val rdd_orig = rdd.collect() for(i <- 0 to 99) { @@ -280,9 +280,9 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA val dirname = Utils.createTempDir() val source_rdd = TestSQLContext.sparkContext.parallelize((1 to 100)) .map(i => TestRDDEntry(i, s"val_$i")) - source_rdd.registerAsTable("source") + source_rdd.registerTempTable("source") val dest_rdd = createParquetFile[TestRDDEntry](dirname.toString) - dest_rdd.registerAsTable("dest") + dest_rdd.registerTempTable("dest") sql("INSERT OVERWRITE INTO dest SELECT * FROM 
source").collect() val rdd_copy1 = sql("SELECT * FROM dest").collect() assert(rdd_copy1.size === 100) @@ -547,7 +547,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA val data = nestedParserSqlContext .parquetFile(ParquetTestData.testNestedDir1.toString) .toSchemaRDD - data.registerAsTable("data") + data.registerTempTable("data") val query = nestedParserSqlContext.sql("SELECT owner, contacts[1].name FROM data") val tmp = query.collect() assert(tmp.size === 2) @@ -562,7 +562,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA val data = nestedParserSqlContext .parquetFile(ParquetTestData.testNestedDir2.toString) .toSchemaRDD - data.registerAsTable("data") + data.registerTempTable("data") val result1 = nestedParserSqlContext.sql("SELECT entries[0].value FROM data").collect() assert(result1.size === 1) assert(result1(0).size === 1) @@ -589,7 +589,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA val data = nestedParserSqlContext .parquetFile(ParquetTestData.testNestedDir3.toString) .toSchemaRDD - data.registerAsTable("data") + data.registerTempTable("data") val result1 = nestedParserSqlContext.sql("SELECT booleanNumberPairs[0].value[0].truth FROM data").collect() assert(result1.size === 1) assert(result1(0).size === 1) @@ -608,7 +608,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA val data = TestSQLContext .parquetFile(ParquetTestData.testNestedDir4.toString) .toSchemaRDD - data.registerAsTable("mapTable") + data.registerTempTable("mapTable") val result1 = sql("SELECT data1 FROM mapTable").collect() assert(result1.size === 1) assert(result1(0)(0) @@ -625,7 +625,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA val data = nestedParserSqlContext .parquetFile(ParquetTestData.testNestedDir4.toString) .toSchemaRDD - data.registerAsTable("mapTable") + data.registerTempTable("mapTable") val result1 = nestedParserSqlContext.sql("SELECT data2 FROM mapTable").collect() assert(result1.size === 1) val entry1 = result1(0)(0) @@ -658,7 +658,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA nestedParserSqlContext .parquetFile(tmpdir.toString) .toSchemaRDD - .registerAsTable("tmpcopy") + .registerTempTable("tmpcopy") val tmpdata = nestedParserSqlContext.sql("SELECT owner, contacts[1].name FROM tmpcopy").collect() assert(tmpdata.size === 2) assert(tmpdata(0).size === 2) @@ -679,7 +679,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA nestedParserSqlContext .parquetFile(tmpdir.toString) .toSchemaRDD - .registerAsTable("tmpmapcopy") + .registerTempTable("tmpmapcopy") val result1 = nestedParserSqlContext.sql("""SELECT data1["key2"] FROM tmpmapcopy""").collect() assert(result1.size === 1) assert(result1(0)(0) === 2) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index ddbc2a79fb512..08d3f983d9e71 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -25,7 +25,7 @@ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hive.service.cli.thrift.ThriftBinaryCLIService import org.apache.hive.service.server.{HiveServer2, 
ServerOptionsProcessor} -import org.apache.spark.sql.Logging +import org.apache.spark.Logging import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ @@ -40,7 +40,7 @@ private[hive] object HiveThriftServer2 extends Logging { val optionsProcessor = new ServerOptionsProcessor("HiveThriftServer2") if (!optionsProcessor.process(args)) { - logger.warn("Error starting HiveThriftServer2 with given arguments") + logWarning("Error starting HiveThriftServer2 with given arguments") System.exit(-1) } @@ -49,12 +49,12 @@ private[hive] object HiveThriftServer2 extends Logging { // Set all properties specified via command line. val hiveConf: HiveConf = ss.getConf hiveConf.getAllProperties.toSeq.sortBy(_._1).foreach { case (k, v) => - logger.debug(s"HiveConf var: $k=$v") + logDebug(s"HiveConf var: $k=$v") } SessionState.start(ss) - logger.info("Starting SparkContext") + logInfo("Starting SparkContext") SparkSQLEnv.init() SessionState.start(ss) @@ -70,10 +70,10 @@ private[hive] object HiveThriftServer2 extends Logging { val server = new HiveThriftServer2(SparkSQLEnv.hiveContext) server.init(hiveConf) server.start() - logger.info("HiveThriftServer2 started") + logInfo("HiveThriftServer2 started") } catch { case e: Exception => - logger.error("Error starting HiveThriftServer2", e) + logError("Error starting HiveThriftServer2", e) System.exit(-1) } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index cb17d7ce58ea0..4d0c506c5a397 100755 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -37,7 +37,7 @@ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.shims.ShimLoader import org.apache.thrift.transport.TSocket -import org.apache.spark.sql.Logging +import org.apache.spark.Logging private[hive] object SparkSQLCLIDriver { private var prompt = "spark-sql" diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala index a56b19a4bcda0..d362d599d08ca 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse -import org.apache.spark.sql.Logging +import org.apache.spark.Logging import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} private[hive] class SparkSQLDriver(val context: HiveContext = SparkSQLEnv.hiveContext) @@ -40,7 +40,7 @@ private[hive] class SparkSQLDriver(val context: HiveContext = SparkSQLEnv.hiveCo private def getResultSetSchema(query: context.QueryExecution): Schema = { val analyzed = query.analyzed - logger.debug(s"Result Schema: ${analyzed.output}") + logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.size == 0) { new Schema(new FieldSchema("Response code", "string", "") :: Nil, null) } else { @@ -61,7 +61,7 @@ private[hive] class SparkSQLDriver(val context: HiveContext = 
SparkSQLEnv.hiveCo new CommandProcessorResponse(0) } catch { case cause: Throwable => - logger.error(s"Failed in [$command]", cause) + logError(s"Failed in [$command]", cause) new CommandProcessorResponse(-3, ExceptionUtils.getFullStackTrace(cause), null) } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 451c3bd7b9352..582264eb59f83 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -20,13 +20,13 @@ package org.apache.spark.sql.hive.thriftserver import org.apache.hadoop.hive.ql.session.SessionState import org.apache.spark.scheduler.{SplitInfo, StatsReportListener} -import org.apache.spark.sql.Logging +import org.apache.spark.Logging import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} /** A singleton object for the master program. The slaves should not access this. */ private[hive] object SparkSQLEnv extends Logging { - logger.debug("Initializing SparkSQLEnv") + logDebug("Initializing SparkSQLEnv") var hiveContext: HiveContext = _ var sparkContext: SparkContext = _ @@ -47,7 +47,7 @@ private[hive] object SparkSQLEnv extends Logging { /** Cleans up and shuts down the Spark SQL environments. */ def stop() { - logger.debug("Shutting down Spark SQL Environment") + logDebug("Shutting down Spark SQL Environment") // Stop the SparkContext if (SparkSQLEnv.sparkContext != null) { sparkContext.stop() diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala index a4e1f3e762e89..d4dadfd21d13f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala @@ -30,10 +30,11 @@ import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager} import org.apache.hive.service.cli.session.HiveSession +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.hive.thriftserver.ReflectionUtils import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} -import org.apache.spark.sql.{Logging, SchemaRDD, Row => SparkRow} +import org.apache.spark.sql.{SchemaRDD, Row => SparkRow} /** * Executes queries using Spark SQL, and maintains a list of handles to active queries. @@ -55,7 +56,7 @@ class SparkSQLOperationManager(hiveContext: HiveContext) extends OperationManage def close(): Unit = { // RDDs will be cleaned automatically upon garbage collection. 
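The logging hunks across these thriftserver files all follow one pattern: the sql-private Logging alias (deleted from the sql package object earlier in this patch) exposed a scalalogging-style logger field, and call sites now use the logInfo/logDebug/logWarning/logError methods on org.apache.spark.Logging instead, which take their message by name so it is only built when the level is enabled. A sketch of the resulting pattern, with a hypothetical class for illustration:

import org.apache.spark.Logging

class QueryRunner extends Logging {  // hypothetical example, not part of the patch
  def run(statement: String): Unit = {
    logInfo(s"Running query '$statement'")   // was: logger.info(...)
    logDebug(s"plan details for $statement") // argument is evaluated lazily
  }
}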
- logger.debug("CLOSING") + logDebug("CLOSING") } def getNextRowSet(order: FetchOrientation, maxRowsL: Long): RowSet = { @@ -112,7 +113,7 @@ class SparkSQLOperationManager(hiveContext: HiveContext) extends OperationManage } def getResultSetSchema: TableSchema = { - logger.warn(s"Result Schema: ${result.queryExecution.analyzed.output}") + logWarning(s"Result Schema: ${result.queryExecution.analyzed.output}") if (result.queryExecution.analyzed.output.size == 0) { new TableSchema(new FieldSchema("Result", "string", "") :: Nil) } else { @@ -124,11 +125,11 @@ class SparkSQLOperationManager(hiveContext: HiveContext) extends OperationManage } def run(): Unit = { - logger.info(s"Running query '$statement'") + logInfo(s"Running query '$statement'") setState(OperationState.RUNNING) try { result = hiveContext.hql(statement) - logger.debug(result.queryExecution.toString()) + logDebug(result.queryExecution.toString()) val groupId = round(random * 1000000).toString hiveContext.sparkContext.setJobGroup(groupId, statement) iter = result.queryExecution.toRdd.toLocalIterator @@ -138,7 +139,7 @@ class SparkSQLOperationManager(hiveContext: HiveContext) extends OperationManage // Actually do need to catch Throwable as some failures don't inherit from Exception and // HiveServer will silently swallow them. case e: Throwable => - logger.error("Error executing query:",e) + logError("Error executing query:",e) throw new HiveSQLException(e.toString) } setState(OperationState.FINISHED) diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala index fe3403b3292ec..b7b7c9957ac34 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala @@ -27,7 +27,7 @@ import java.sql.{Connection, DriverManager, Statement} import org.scalatest.{BeforeAndAfterAll, FunSuite} -import org.apache.spark.sql.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.util.getTempFilePath /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 7e3b8727bebed..3c70b3f0921a5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -23,7 +23,7 @@ import java.util.{ArrayList => JArrayList} import scala.collection.JavaConversions._ import scala.language.implicitConversions -import scala.reflect.runtime.universe.TypeTag +import scala.reflect.runtime.universe.{TypeTag, typeTag} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.Driver @@ -35,8 +35,9 @@ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.catalyst.analysis.{Analyzer, OverrideCatalog} +import org.apache.spark.sql.catalyst.analysis.{OverrideFunctionRegistry, Analyzer, OverrideCatalog} import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.execution.ExtractPythonUdfs import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.execution.{Command => PhysicalCommand} import 
org.apache.spark.sql.hive.execution.DescribeHiveTableCommand @@ -155,10 +156,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { } } + // Note that HiveUDFs will be overridden by functions registered in this context. + override protected[sql] lazy val functionRegistry = + new HiveFunctionRegistry with OverrideFunctionRegistry + /* An analyzer that uses the Hive metastore. */ @transient override protected[sql] lazy val analyzer = - new Analyzer(catalog, HiveFunctionRegistry, caseSensitive = false) + new Analyzer(catalog, functionRegistry, caseSensitive = false) /** * Runs the specified SQL query using Hive. @@ -207,7 +212,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { } } catch { case e: Exception => - logger.error( + logError( s""" |====================== |HIVE FAILURE OUTPUT @@ -250,7 +255,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { protected[sql] abstract class QueryExecution extends super.QueryExecution { // TODO: Create mixin for the analyzer instead of overriding things here. override lazy val optimizedPlan = - optimizer(catalog.PreInsertionCasts(catalog.CreateTables(analyzed))) + optimizer(ExtractPythonUdfs(catalog.PreInsertionCasts(catalog.CreateTables(analyzed)))) override lazy val toRdd: RDD[Row] = executedPlan.execute().map(_.copy()) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index fa4e78439c26c..df3604439e483 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -28,7 +28,8 @@ import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.serde2.Deserializer import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.{SQLContext, Logging} +import org.apache.spark.Logging +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.analysis.{EliminateAnalysisOperators, Catalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 3d2eb1eefaeda..bc2fefafd58c8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -297,8 +297,11 @@ private[hive] object HiveQl { matches.headOption } - assert(remainingNodes.isEmpty, - s"Unhandled clauses: ${remainingNodes.map(dumpTree(_)).mkString("\n")}") + if (remainingNodes.nonEmpty) { + sys.error( + s"""Unhandled clauses: ${remainingNodes.map(dumpTree(_)).mkString("\n")}. 
+ |You are likely trying to use an unsupported Hive feature."""".stripMargin) + } clauses } @@ -748,7 +751,10 @@ private[hive] object HiveQl { case Token(allJoinTokens(joinToken), relation1 :: relation2 :: other) => - assert(other.size <= 1, s"Unhandled join child $other") + if (!(other.size <= 1)) { + sys.error(s"Unsupported join operation: $other") + } + val joinType = joinToken match { case "TOK_JOIN" => Inner case "TOK_RIGHTOUTERJOIN" => RightOuter @@ -756,7 +762,6 @@ private[hive] object HiveQl { case "TOK_FULLOUTERJOIN" => FullOuter case "TOK_LEFTSEMIJOIN" => LeftSemi } - assert(other.size <= 1, "Unhandled join clauses.") Join(nodeToRelation(relation1), nodeToRelation(relation2), joinType, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index c50e8c4b5c5d3..c605e8adcfb0f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -148,7 +148,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { describedTables ++ logical.collect { case UnresolvedRelation(databaseName, name, _) => name } val referencedTestTables = referencedTables.filter(testTables.contains) - logger.debug(s"Query references test tables: ${referencedTestTables.mkString(", ")}") + logDebug(s"Query references test tables: ${referencedTestTables.mkString(", ")}") referencedTestTables.foreach(loadTestTable) // Proceed with analysis. analyzer(logical) @@ -273,7 +273,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { if (!(loadedTables contains name)) { // Marks the table as loaded first to prevent infite mutually recursive table loading. loadedTables += name - logger.info(s"Loading test table $name") + logInfo(s"Loading test table $name") val createCmds = testTables.get(name).map(_.commands).getOrElse(sys.error(s"Unknown test table $name")) createCmds.foreach(_()) @@ -297,8 +297,8 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { def reset() { try { // HACK: Hive is too noisy by default. - org.apache.log4j.LogManager.getCurrentLoggers.foreach { logger => - logger.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) + org.apache.log4j.LogManager.getCurrentLoggers.foreach { log => + log.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } // It is important that we RESET first as broken hooks that might have been set could break @@ -312,7 +312,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { loadedTables.clear() catalog.client.getAllTables("default").foreach { t => - logger.debug(s"Deleting table $t") + logDebug(s"Deleting table $t") val table = catalog.client.getTable("default", t) catalog.client.getIndexes("default", t, 255).foreach { index => @@ -325,7 +325,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { } catalog.client.getAllDatabases.filterNot(_ == "default").foreach { db => - logger.debug(s"Dropping Database: $db") + logDebug(s"Dropping Database: $db") catalog.client.dropDatabase(db, true, false, true) } @@ -347,7 +347,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { loadTestTable("srcpart") } catch { case e: Exception => - logger.error(s"FATAL ERROR: Failed to reset TestDB state. $e") + logError(s"FATAL ERROR: Failed to reset TestDB state. $e") // At this point there is really no reason to continue, but the test framework traps exits. 
// So instead we just pause forever so that at least the developer can see where things // started to go wrong. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala index 7582b4743d404..179aac5cbd5cd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala @@ -25,7 +25,7 @@ import org.apache.hadoop.hive.ql.exec.{FunctionInfo, FunctionRegistry} import org.apache.hadoop.hive.ql.udf.{UDFType => HiveUDFType} import org.apache.hadoop.hive.ql.udf.generic._ -import org.apache.spark.sql.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.types._ @@ -34,7 +34,8 @@ import org.apache.spark.util.Utils.getContextOrSparkClassLoader /* Implicit conversions */ import scala.collection.JavaConversions._ -private[hive] object HiveFunctionRegistry extends analysis.FunctionRegistry with HiveInspectors { +private[hive] abstract class HiveFunctionRegistry + extends analysis.FunctionRegistry with HiveInspectors { def getFunctionInfo(name: String) = FunctionRegistry.getFunctionInfo(name) @@ -92,9 +93,8 @@ private[hive] abstract class HiveUdf extends Expression with Logging with HiveFu } private[hive] case class HiveSimpleUdf(functionClassName: String, children: Seq[Expression]) - extends HiveUdf { + extends HiveUdf with HiveInspectors { - import org.apache.spark.sql.hive.HiveFunctionRegistry._ type UDFType = UDF @transient @@ -119,7 +119,7 @@ private[hive] case class HiveSimpleUdf(functionClassName: String, children: Seq[ sys.error(s"No matching wrapper found, options: ${argClass.getConstructors.toSeq}.")) (a: Any) => { - logger.debug( + logDebug( s"Wrapping $a of type ${if (a == null) "null" else a.getClass.getName} using $constructor.") // We must make sure that primitives get boxed java style. 
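The boxing comment above is about reflective invocation: Hive resolves and calls UDF methods through Java reflection, so every argument must be an AnyRef, and Scala primitives need explicit boxing to their java.lang counterparts. Roughly, as an illustration rather than the wrapper code itself:

def box(a: Any): AnyRef = a match {
  case i: Int    => Int.box(i)     // java.lang.Integer
  case l: Long   => Long.box(l)    // java.lang.Long
  case d: Double => Double.box(d)  // java.lang.Double
  case other     => other.asInstanceOf[AnyRef]
}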
if (a == null) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala index 11d8b1f0a3d96..95921c3d7ae09 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -51,9 +51,9 @@ class QueryTest extends FunSuite { fail( s""" |Exception thrown while executing query: - |${rdd.logicalPlan} + |${rdd.queryExecution} |== Exception == - |$e + |${stackTraceToString(e)} """.stripMargin) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 833f3502154f3..7e323146f9da2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -28,7 +28,7 @@ case class TestData(key: Int, value: String) class InsertIntoHiveTableSuite extends QueryTest { val testData = TestHive.sparkContext.parallelize( (1 to 100).map(i => TestData(i, i.toString))) - testData.registerAsTable("testData") + testData.registerTempTable("testData") test("insertInto() HiveTable") { createTable[TestData]("createAndInsertTest") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveQLSuite.scala index 10c8069a624e6..578f27574ad2f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/api/java/JavaHiveQLSuite.scala @@ -63,7 +63,7 @@ class JavaHiveQLSuite extends FunSuite { javaHiveCtx.hql(s"CREATE TABLE $tableName(key INT, value STRING)").count() } - javaHiveCtx.hql("SHOW TABLES").registerAsTable("show_tables") + javaHiveCtx.hql("SHOW TABLES").registerTempTable("show_tables") assert( javaHiveCtx @@ -73,7 +73,7 @@ class JavaHiveQLSuite extends FunSuite { .contains(tableName)) assertResult(Array(Array("key", "int", "None"), Array("value", "string", "None"))) { - javaHiveCtx.hql(s"DESCRIBE $tableName").registerAsTable("describe_table") + javaHiveCtx.hql(s"DESCRIBE $tableName").registerTempTable("describe_table") javaHiveCtx .hql("SELECT result FROM describe_table") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 6c8fe4b196dea..83cfbc6b4a002 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -21,7 +21,7 @@ import java.io._ import org.scalatest.{BeforeAndAfterAll, FunSuite, GivenWhenThen} -import org.apache.spark.sql.Logging +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.logical.{NativeCommand => LogicalNativeCommand} @@ -197,7 +197,7 @@ abstract class HiveComparisonTest // If test sharding is enable, skip tests that are not in the correct shard. 
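The shard filter that follows assigns tests by hashing their names: with numShards shards configured, a test runs only on the shard whose id matches the hash modulo the shard count. The predicate, written out:

def runsInShard(testCaseName: String, shardId: Int, numShards: Int): Boolean =
  testCaseName.hashCode % numShards == shardId  // hashCode may be negative, matching the source

Each shard thus sees a stable, roughly equal slice of the suite without any coordination between shards.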
shardInfo.foreach { case (shardId, numShards) if testCaseName.hashCode % numShards != shardId => return - case (shardId, _) => logger.debug(s"Shard $shardId includes test '$testCaseName'") + case (shardId, _) => logDebug(s"Shard $shardId includes test '$testCaseName'") } // Skip tests found in directories specified by user. @@ -213,13 +213,13 @@ abstract class HiveComparisonTest .map(new File(_, testCaseName)) .filter(_.exists) if (runOnlyDirectories.nonEmpty && runIndicators.isEmpty) { - logger.debug( + logDebug( s"Skipping test '$testCaseName' not found in ${runOnlyDirectories.map(_.getCanonicalPath)}") return } test(testCaseName) { - logger.debug(s"=== HIVE TEST: $testCaseName ===") + logDebug(s"=== HIVE TEST: $testCaseName ===") // Clear old output for this testcase. outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete()) @@ -235,7 +235,7 @@ abstract class HiveComparisonTest .filterNot(_ contains "hive.outerjoin.supports.filters") if (allQueries != queryList) - logger.warn(s"Simplifications made on unsupported operations for test $testCaseName") + logWarning(s"Simplifications made on unsupported operations for test $testCaseName") lazy val consoleTestCase = { val quotes = "\"\"\"" @@ -257,11 +257,11 @@ abstract class HiveComparisonTest } val hiveCachedResults = hiveCacheFiles.flatMap { cachedAnswerFile => - logger.debug(s"Looking for cached answer file $cachedAnswerFile.") + logDebug(s"Looking for cached answer file $cachedAnswerFile.") if (cachedAnswerFile.exists) { Some(fileToString(cachedAnswerFile)) } else { - logger.debug(s"File $cachedAnswerFile not found") + logDebug(s"File $cachedAnswerFile not found") None } }.map { @@ -272,7 +272,7 @@ abstract class HiveComparisonTest val hiveResults: Seq[Seq[String]] = if (hiveCachedResults.size == queryList.size) { - logger.info(s"Using answer cache for test: $testCaseName") + logInfo(s"Using answer cache for test: $testCaseName") hiveCachedResults } else { @@ -287,7 +287,7 @@ abstract class HiveComparisonTest if (installHooksCommand.findAllMatchIn(queryString).nonEmpty) sys.error("hive exec hooks not supported for tests.") - logger.warn(s"Running query ${i+1}/${queryList.size} with hive.") + logWarning(s"Running query ${i+1}/${queryList.size} with hive.") // Analyze the query with catalyst to ensure test tables are loaded. val answer = hiveQuery.analyzed match { case _: ExplainCommand => Nil // No need to execute EXPLAIN queries as we don't check the output. @@ -351,7 +351,7 @@ abstract class HiveComparisonTest val resultComparison = sideBySide(hivePrintOut, catalystPrintOut).mkString("\n") if (recomputeCache) { - logger.warn(s"Clearing cache files for failed test $testCaseName") + logWarning(s"Clearing cache files for failed test $testCaseName") hiveCacheFiles.foreach(_.delete()) } @@ -380,7 +380,7 @@ abstract class HiveComparisonTest TestHive.runSqlHive("SELECT key FROM src") } catch { case e: Exception => - logger.error(s"FATAL ERROR: Canary query threw $e This implies that the testing environment has likely been corrupted.") + logError(s"FATAL ERROR: Canary query threw $e This implies that the testing environment has likely been corrupted.") // The testing setup traps exits so wait here for a long time so the developer can see when things started // to go wrong. 
Thread.sleep(1000000) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala index 50ab71a9003d3..02518d516261b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala @@ -53,7 +53,7 @@ abstract class HiveQueryFileTest extends HiveComparisonTest { testCases.sorted.foreach { case (testCaseName, testCaseFile) => if (blackList.map(_.r.pattern.matcher(testCaseName).matches()).reduceLeft(_||_)) { - logger.debug(s"Blacklisted test skipped $testCaseName") + logDebug(s"Blacklisted test skipped $testCaseName") } else if (realWhiteList.map(_.r.pattern.matcher(testCaseName).matches()).reduceLeft(_||_) || runAll) { // Build a test case and submit it to scala test framework... val queriesString = fileToString(testCaseFile) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 89cc589fb8001..4ed41550cf530 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -247,7 +247,7 @@ class HiveQuerySuite extends HiveComparisonTest { TestHive.sparkContext.parallelize( TestData(1, "str1") :: TestData(2, "str2") :: Nil) - testData.registerAsTable("REGisteredTABle") + testData.registerTempTable("REGisteredTABle") assertResult(Array(Array(2, "str2"))) { hql("SELECT tablealias.A, TABLEALIAS.b FROM reGisteredTABle TableAlias " + @@ -272,7 +272,7 @@ class HiveQuerySuite extends HiveComparisonTest { test("SPARK-2180: HAVING support in GROUP BY clauses (positive)") { val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3)) .zipWithIndex.map {case Pair(Pair(value, attr), key) => HavingRow(key, value, attr)} - TestHive.sparkContext.parallelize(fixture).registerAsTable("having_test") + TestHive.sparkContext.parallelize(fixture).registerTempTable("having_test") val results = hql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3") .collect() @@ -401,7 +401,7 @@ class HiveQuerySuite extends HiveComparisonTest { TestHive.sparkContext.parallelize( TestData(1, "str1") :: TestData(1, "str2") :: Nil) - testData.registerAsTable("test_describe_commands2") + testData.registerTempTable("test_describe_commands2") assertResult( Array( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala index fb03db12a0b01..2455c18925dfa 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala @@ -54,14 +54,14 @@ class HiveResolutionSuite extends HiveComparisonTest { test("case insensitivity with scala reflection") { // Test resolution with Scala Reflection TestHive.sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) - .registerAsTable("caseSensitivityTest") + .registerTempTable("caseSensitivityTest") hql("SELECT a, b, A, B, n.a, n.b, n.A, n.B FROM caseSensitivityTest") } test("nested repeated resolution") { TestHive.sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) - 
.registerAsTable("nestedRepeatedTest") + .registerTempTable("nestedRepeatedTest") assert(hql("SELECT nestedArray[0].a FROM nestedRepeatedTest").collect().head(0) === 1) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala index 47526e3596e44..6545e8d7dcb69 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala @@ -41,7 +41,7 @@ class HiveParquetSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAft // write test data ParquetTestData.writeFile() testRDD = parquetFile(ParquetTestData.testDir.toString) - testRDD.registerAsTable("testsource") + testRDD.registerTempTable("testsource") } override def afterAll() { @@ -67,7 +67,7 @@ class HiveParquetSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAft .map(i => Cases(i, i)) .saveAsParquetFile(tempFile.getCanonicalPath) - parquetFile(tempFile.getCanonicalPath).registerAsTable("cases") + parquetFile(tempFile.getCanonicalPath).registerTempTable("cases") hql("SELECT upper FROM cases").collect().map(_.getString(0)) === (1 to 10).map(_.toString) hql("SELECT LOWER FROM cases").collect().map(_.getString(0)) === (1 to 10).map(_.toString) } @@ -86,7 +86,7 @@ class HiveParquetSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAft test("Converting Hive to Parquet Table via saveAsParquetFile") { hql("SELECT * FROM src").saveAsParquetFile(dirname.getAbsolutePath) - parquetFile(dirname.getAbsolutePath).registerAsTable("ptable") + parquetFile(dirname.getAbsolutePath).registerTempTable("ptable") val rddOne = hql("SELECT * FROM src").collect().sortBy(_.getInt(0)) val rddTwo = hql("SELECT * from ptable").collect().sortBy(_.getInt(0)) compareRDDs(rddOne, rddTwo, "src (Hive)", Seq("key:Int", "value:String")) @@ -94,7 +94,7 @@ class HiveParquetSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAft test("INSERT OVERWRITE TABLE Parquet table") { hql("SELECT * FROM testsource").saveAsParquetFile(dirname.getAbsolutePath) - parquetFile(dirname.getAbsolutePath).registerAsTable("ptable") + parquetFile(dirname.getAbsolutePath).registerTempTable("ptable") // let's do three overwrites for good measure hql("INSERT OVERWRITE TABLE ptable SELECT * FROM testsource").collect() hql("INSERT OVERWRITE TABLE ptable SELECT * FROM testsource").collect() From fe57ed080e0dcd4e9ca624360c91a3d96086460a Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 03:09:52 -0400 Subject: [PATCH 09/23] removing merge conflict backups --- python/pyspark/mllib/_common.py.orig | 572 --------------------------- python/pyspark/mllib/util.py.orig | 211 ---------- 2 files changed, 783 deletions(-) delete mode 100644 python/pyspark/mllib/_common.py.orig delete mode 100644 python/pyspark/mllib/util.py.orig diff --git a/python/pyspark/mllib/_common.py.orig b/python/pyspark/mllib/_common.py.orig deleted file mode 100644 index e55270733f55b..0000000000000 --- a/python/pyspark/mllib/_common.py.orig +++ /dev/null @@ -1,572 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import struct -import numpy -from numpy import ndarray, float64, int64, int32, array_equal, array -from pyspark import SparkContext, RDD -from pyspark.mllib.linalg import SparseVector -from pyspark.serializers import Serializer - - -""" -Common utilities shared throughout MLlib, primarily for dealing with -different data types. These include: -- Serialization utilities to / from byte arrays that Java can handle -- Serializers for other data types, like ALS Rating objects -- Common methods for linear models -- Methods to deal with the different vector types we support, such as - SparseVector and scipy.sparse matrices. -""" - - -# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, -# such as _dot and _serialize_double_vector, start to support scipy.sparse -# matrices. - -_have_scipy = False -_scipy_issparse = None -try: - import scipy.sparse - _have_scipy = True - _scipy_issparse = scipy.sparse.issparse -except: - # No SciPy in environment, but that's okay - pass - - -# Serialization functions to and from Scala. These use the following formats, understood -# by the PythonMLLibAPI class in Scala: -# -# Dense double vector format: -# -# [1-byte 1] [4-byte length] [length*8 bytes of data] -# -# Sparse double vector format: -# -# [1-byte 2] [4-byte length] [4-byte nonzeros] [nonzeros*4 bytes of indices] \ -# [nonzeros*8 bytes of values] -# -# Double matrix format: -# -# [1-byte 3] [4-byte rows] [4-byte cols] [rows*cols*8 bytes of data] -# -# LabeledPoint format: -# -# [1-byte 4] [8-byte label] [dense or sparse vector] -# -# This is all in machine-endian. That means that the Java interpreter and the -# Python interpreter must agree on what endian the machine is. - - -DENSE_VECTOR_MAGIC = 1 -SPARSE_VECTOR_MAGIC = 2 -DENSE_MATRIX_MAGIC = 3 -LABELED_POINT_MAGIC = 4 - - -def _deserialize_numpy_array(shape, ba, offset, dtype=float64): - """ - Deserialize a numpy array of the given type from an offset in - bytearray ba, assigning it the given shape. - - >>> x = array([1.0, 2.0, 3.0, 4.0, 5.0]) - >>> array_equal(x, _deserialize_numpy_array(x.shape, x.data, 0)) - True - >>> x = array([1.0, 2.0, 3.0, 4.0]).reshape(2,2) - >>> array_equal(x, _deserialize_numpy_array(x.shape, x.data, 0)) - True - >>> x = array([1, 2, 3], dtype=int32) - >>> array_equal(x, _deserialize_numpy_array(x.shape, x.data, 0, dtype=int32)) - True - """ - ar = ndarray(shape=shape, buffer=ba, offset=offset, dtype=dtype, order='C') - return ar.copy() - - -def _serialize_double(d): - """ - Serialize a double (float or numpy.float64) into a mutually understood format. - """ - if type(d) == float or type(d) == float64 or type(d) == int or type(d) == long: - d = float64(d) - ba = bytearray(8) - _copyto(d, buffer=ba, offset=0, shape=[1], dtype=float64) - return ba - else: - raise TypeError("_serialize_double called on non-float input") - - -def _serialize_double_vector(v): - """ - Serialize a double vector into a mutually understood format. - - Note: we currently do not use a magic byte for double for storage - efficiency. 
This should be reconsidered when we add Ser/De for other - 8-byte types (e.g. Long), for safety. The corresponding deserializer, - _deserialize_double, needs to be modified as well if the serialization - scheme changes. - - >>> x = array([1,2,3]) - >>> y = _deserialize_double_vector(_serialize_double_vector(x)) - >>> array_equal(y, array([1.0, 2.0, 3.0])) - True - """ - v = _convert_vector(v) - if type(v) == ndarray: - return _serialize_dense_vector(v) - elif type(v) == SparseVector: - return _serialize_sparse_vector(v) - else: - raise TypeError("_serialize_double_vector called on a %s; " - "wanted ndarray or SparseVector" % type(v)) - - -def _serialize_dense_vector(v): - """Serialize a dense vector given as a NumPy array.""" - if v.ndim != 1: - raise TypeError("_serialize_double_vector called on a %ddarray; " - "wanted a 1darray" % v.ndim) - if v.dtype != float64: - if numpy.issubdtype(v.dtype, numpy.complex): - raise TypeError("_serialize_double_vector called on an ndarray of %s; " - "wanted ndarray of float64" % v.dtype) - v = v.astype(float64) - length = v.shape[0] - ba = bytearray(5 + 8 * length) - ba[0] = DENSE_VECTOR_MAGIC - length_bytes = ndarray(shape=[1], buffer=ba, offset=1, dtype=int32) - length_bytes[0] = length - _copyto(v, buffer=ba, offset=5, shape=[length], dtype=float64) - return ba - - -def _serialize_sparse_vector(v): - """Serialize a pyspark.mllib.linalg.SparseVector.""" - nonzeros = len(v.indices) - ba = bytearray(9 + 12 * nonzeros) - ba[0] = SPARSE_VECTOR_MAGIC - header = ndarray(shape=[2], buffer=ba, offset=1, dtype=int32) - header[0] = v.size - header[1] = nonzeros - _copyto(v.indices, buffer=ba, offset=9, shape=[nonzeros], dtype=int32) - values_offset = 9 + 4 * nonzeros - _copyto(v.values, buffer=ba, offset=values_offset, - shape=[nonzeros], dtype=float64) - return ba - - -def _deserialize_double(ba, offset=0): - """Deserialize a double from a mutually understood format. - - >>> import sys - >>> _deserialize_double(_serialize_double(123.0)) == 123.0 - True - >>> _deserialize_double(_serialize_double(float64(0.0))) == 0.0 - True - >>> _deserialize_double(_serialize_double(1)) == 1.0 - True - >>> _deserialize_double(_serialize_double(1L)) == 1.0 - True - >>> x = sys.float_info.max - >>> _deserialize_double(_serialize_double(sys.float_info.max)) == x - True - >>> y = float64(sys.float_info.max) - >>> _deserialize_double(_serialize_double(sys.float_info.max)) == y - True - """ - if type(ba) != bytearray: - raise TypeError( - "_deserialize_double called on a %s; wanted bytearray" % type(ba)) - if len(ba) - offset != 8: - raise TypeError( - "_deserialize_double called on a %d-byte array; wanted 8 bytes." % nb) - return struct.unpack("d", ba[offset:])[0] - - -def _deserialize_double_vector(ba, offset=0): - """Deserialize a double vector from a mutually understood format. 
- - >>> x = array([1.0, 2.0, 3.0, 4.0, -1.0, 0.0, -0.0]) - >>> array_equal(x, _deserialize_double_vector(_serialize_double_vector(x))) - True - >>> s = SparseVector(4, [1, 3], [3.0, 5.5]) - >>> s == _deserialize_double_vector(_serialize_double_vector(s)) - True - """ - if type(ba) != bytearray: - raise TypeError("_deserialize_double_vector called on a %s; " - "wanted bytearray" % type(ba)) - nb = len(ba) - offset - if nb < 5: - raise TypeError("_deserialize_double_vector called on a %d-byte array, " - "which is too short" % nb) - if ba[offset] == DENSE_VECTOR_MAGIC: - return _deserialize_dense_vector(ba, offset) - elif ba[offset] == SPARSE_VECTOR_MAGIC: - return _deserialize_sparse_vector(ba, offset) - else: - raise TypeError("_deserialize_double_vector called on bytearray " - "with wrong magic") - - -def _deserialize_dense_vector(ba, offset=0): - """Deserialize a dense vector into a numpy array.""" - nb = len(ba) - offset - if nb < 5: - raise TypeError("_deserialize_dense_vector called on a %d-byte array, " - "which is too short" % nb) - length = ndarray(shape=[1], buffer=ba, offset=offset + 1, dtype=int32)[0] - if nb < 8 * length + 5: - raise TypeError("_deserialize_dense_vector called on bytearray " - "with wrong length") - return _deserialize_numpy_array([length], ba, offset + 5) - - -def _deserialize_sparse_vector(ba, offset=0): - """Deserialize a sparse vector into a MLlib SparseVector object.""" - nb = len(ba) - offset - if nb < 9: - raise TypeError("_deserialize_sparse_vector called on a %d-byte array, " - "which is too short" % nb) - header = ndarray(shape=[2], buffer=ba, offset=offset + 1, dtype=int32) - size = header[0] - nonzeros = header[1] - if nb < 9 + 12 * nonzeros: - raise TypeError("_deserialize_sparse_vector called on bytearray " - "with wrong length") - indices = _deserialize_numpy_array([nonzeros], ba, offset + 9, dtype=int32) - values = _deserialize_numpy_array( - [nonzeros], ba, offset + 9 + 4 * nonzeros, dtype=float64) - return SparseVector(int(size), indices, values) - - -def _serialize_double_matrix(m): - """Serialize a double matrix into a mutually understood format.""" - if (type(m) == ndarray and m.ndim == 2): - if m.dtype != float64: - if numpy.issubdtype(m.dtype, numpy.complex): - raise TypeError("_serialize_double_matrix called on an ndarray of %s; " - "wanted ndarray of float64" % m.dtype) - m = m.astype(float64) - rows = m.shape[0] - cols = m.shape[1] - ba = bytearray(9 + 8 * rows * cols) - ba[0] = DENSE_MATRIX_MAGIC - lengths = ndarray(shape=[3], buffer=ba, offset=1, dtype=int32) - lengths[0] = rows - lengths[1] = cols - _copyto(m, buffer=ba, offset=9, shape=[rows, cols], dtype=float64) - return ba - else: - raise TypeError("_serialize_double_matrix called on a " - "non-double-matrix") - - -def _deserialize_double_matrix(ba): - """Deserialize a double matrix from a mutually understood format.""" - if type(ba) != bytearray: - raise TypeError("_deserialize_double_matrix called on a %s; " - "wanted bytearray" % type(ba)) - if len(ba) < 9: - raise TypeError("_deserialize_double_matrix called on a %d-byte array, " - "which is too short" % len(ba)) - if ba[0] != DENSE_MATRIX_MAGIC: - raise TypeError("_deserialize_double_matrix called on bytearray " - "with wrong magic") - lengths = ndarray(shape=[2], buffer=ba, offset=1, dtype=int32) - rows = lengths[0] - cols = lengths[1] - if (len(ba) != 8 * rows * cols + 9): - raise TypeError("_deserialize_double_matrix called on bytearray " - "with wrong length") - return _deserialize_numpy_array([rows, cols], ba, 9) - - 
-def _serialize_labeled_point(p): - """ - Serialize a LabeledPoint with a features vector of any type. - - >>> from pyspark.mllib.regression import LabeledPoint - >>> dp0 = LabeledPoint(0.5, array([1.0, 2.0, 3.0, 4.0, -1.0, 0.0, -0.0])) - >>> dp1 = _deserialize_labeled_point(_serialize_labeled_point(dp0)) - >>> dp1.label == dp0.label - True - >>> array_equal(dp1.features, dp0.features) - True - >>> sp0 = LabeledPoint(0.0, SparseVector(4, [1, 3], [3.0, 5.5])) - >>> sp1 = _deserialize_labeled_point(_serialize_labeled_point(sp0)) - >>> sp1.label == sp1.label - True - >>> sp1.features == sp0.features - True - """ - from pyspark.mllib.regression import LabeledPoint - serialized_features = _serialize_double_vector(p.features) - header = bytearray(9) - header[0] = LABELED_POINT_MAGIC - header_float = ndarray(shape=[1], buffer=header, offset=1, dtype=float64) - header_float[0] = p.label - return header + serialized_features - - -def _deserialize_labeled_point(ba, offset=0): - """Deserialize a LabeledPoint from a mutually understood format.""" - from pyspark.mllib.regression import LabeledPoint - if type(ba) != bytearray: - raise TypeError("Expecting a bytearray but got %s" % type(ba)) - if ba[offset] != LABELED_POINT_MAGIC: - raise TypeError("Expecting magic number %d but got %d" % - (LABELED_POINT_MAGIC, ba[0])) - label = ndarray(shape=[1], buffer=ba, offset=offset + 1, dtype=float64)[0] - features = _deserialize_double_vector(ba, offset + 9) - return LabeledPoint(label, features) - - -def _copyto(array, buffer, offset, shape, dtype): - """ - Copy the contents of a vector to a destination bytearray at the - given offset. - - TODO: In the future this could use numpy.copyto on NumPy 1.7+, but - we should benchmark that to see whether it provides a benefit. - """ - temp_array = ndarray( - shape=shape, buffer=buffer, offset=offset, dtype=dtype, order='C') - temp_array[...] = array - - -def _get_unmangled_rdd(data, serializer, cache=True): - """ - :param cache: If True, the serialized RDD is cached. (default = True) - WARNING: Users should unpersist() this later! - """ - dataBytes = data.map(serializer) - dataBytes._bypass_serializer = True - if cache: - dataBytes.cache() - return dataBytes - - -def _get_unmangled_double_vector_rdd(data, cache=True): - """ - Map a pickled Python RDD of Python dense or sparse vectors to a Java RDD of - _serialized_double_vectors. - :param cache: If True, the serialized RDD is cached. (default = True) - WARNING: Users should unpersist() this later! - """ - return _get_unmangled_rdd(data, _serialize_double_vector, cache) - - -<<<<<<< HEAD -# Map a pickled Python RDD of LabeledPoint to a Java RDD of -# _serialized_labeled_points -def _get_unmangled_labeled_point_rdd(data): - return _get_unmangled_rdd(data, _serialize_labeled_point) -======= -def _get_unmangled_labeled_point_rdd(data, cache=True): - """ - Map a pickled Python RDD of LabeledPoint to a Java RDD of _serialized_labeled_points. - :param cache: If True, the serialized RDD is cached. (default = True) - WARNING: Users should unpersist() this later! - """ - return _get_unmangled_rdd(data, _serialize_labeled_point, cache) ->>>>>>> 3dc55fdf450b4237f7c592fce56d1467fd206366 - - -# Common functions for dealing with and training linear models - -def _linear_predictor_typecheck(x, coeffs): - """ - Check that x is a one-dimensional vector of the right shape. - This is a temporary hackaround until we actually implement bulk predict. 
- """ - x = _convert_vector(x) - if type(x) == ndarray: - if x.ndim == 1: - if x.shape != coeffs.shape: - raise RuntimeError("Got array of %d elements; wanted %d" % ( - numpy.shape(x)[0], coeffs.shape[0])) - else: - raise RuntimeError("Bulk predict not yet supported.") - elif type(x) == SparseVector: - if x.size != coeffs.shape[0]: - raise RuntimeError("Got sparse vector of size %d; wanted %d" % ( - x.size, coeffs.shape[0])) - elif isinstance(x, RDD): - raise RuntimeError("Bulk predict not yet supported.") - else: - raise TypeError( - "Argument of type " + type(x).__name__ + " unsupported") - - -# If we weren't given initial weights, take a zero vector of the appropriate -# length. -def _get_initial_weights(initial_weights, data): - if initial_weights is None: - initial_weights = _convert_vector(data.first().features) - if type(initial_weights) == ndarray: - if initial_weights.ndim != 1: - raise TypeError("At least one data element has " - + initial_weights.ndim + " dimensions, which is not 1") - initial_weights = numpy.zeros([initial_weights.shape[0]]) - elif type(initial_weights) == SparseVector: - initial_weights = numpy.zeros([initial_weights.size]) - return initial_weights - - -# train_func should take two parameters, namely data and initial_weights, and -# return the result of a call to the appropriate JVM stub. -# _regression_train_wrapper is responsible for setup and error checking. -def _regression_train_wrapper(sc, train_func, klass, data, initial_weights): - initial_weights = _get_initial_weights(initial_weights, data) - dataBytes = _get_unmangled_labeled_point_rdd(data) - ans = train_func(dataBytes, _serialize_double_vector(initial_weights)) - if len(ans) != 2: - raise RuntimeError("JVM call result had unexpected length") - elif type(ans[0]) != bytearray: - raise RuntimeError("JVM call result had first element of type " - + type(ans[0]).__name__ + " which is not bytearray") - elif type(ans[1]) != float: - raise RuntimeError("JVM call result had second element of type " - + type(ans[0]).__name__ + " which is not float") - return klass(_deserialize_double_vector(ans[0]), ans[1]) - - -# Functions for serializing ALS Rating objects and tuples - -def _serialize_rating(r): - ba = bytearray(16) - intpart = ndarray(shape=[2], buffer=ba, dtype=int32) - doublepart = ndarray(shape=[1], buffer=ba, dtype=float64, offset=8) - intpart[0], intpart[1], doublepart[0] = r - return ba - - -class RatingDeserializer(Serializer): - - def loads(self, stream): - length = struct.unpack("!i", stream.read(4))[0] - ba = stream.read(length) - res = ndarray(shape=(3, ), buffer=ba, dtype=float64, offset=4) - return int(res[0]), int(res[1]), res[2] - - def load_stream(self, stream): - while True: - try: - yield self.loads(stream) - except struct.error: - return - except EOFError: - return - - -def _serialize_tuple(t): - ba = bytearray(8) - intpart = ndarray(shape=[2], buffer=ba, dtype=int32) - intpart[0], intpart[1] = t - return ba - - -# Vector math functions that support all of our vector types - -def _convert_vector(vec): - """ - Convert a vector to a format we support internally. This does - the following: - - * For dense NumPy vectors (ndarray), returns them as is - * For our SparseVector class, returns that as is - * For Python lists, converts them to NumPy vectors - * For scipy.sparse.*_matrix column vectors, converts them to - our own SparseVector type. - - This should be called before passing any data to our algorithms - or attempting to serialize it to Java. 
- """ - if type(vec) == ndarray or type(vec) == SparseVector: - return vec - elif type(vec) == list: - return array(vec, dtype=float64) - elif _have_scipy: - if _scipy_issparse(vec): - assert vec.shape[1] == 1, "Expected column vector" - csc = vec.tocsc() - return SparseVector(vec.shape[0], csc.indices, csc.data) - raise TypeError( - "Expected NumPy array, SparseVector, or scipy.sparse matrix") - - -def _squared_distance(v1, v2): - """ - Squared distance of two NumPy or sparse vectors. - - >>> dense1 = array([1., 2.]) - >>> sparse1 = SparseVector(2, [0, 1], [1., 2.]) - >>> dense2 = array([2., 1.]) - >>> sparse2 = SparseVector(2, [0, 1], [2., 1.]) - >>> _squared_distance(dense1, dense2) - 2.0 - >>> _squared_distance(dense1, sparse2) - 2.0 - >>> _squared_distance(sparse1, dense2) - 2.0 - >>> _squared_distance(sparse1, sparse2) - 2.0 - """ - v1 = _convert_vector(v1) - v2 = _convert_vector(v2) - if type(v1) == ndarray and type(v2) == ndarray: - diff = v1 - v2 - return numpy.dot(diff, diff) - elif type(v1) == ndarray: - return v2.squared_distance(v1) - else: - return v1.squared_distance(v2) - - -def _dot(vec, target): - """ - Compute the dot product of a vector of the types we support - (Numpy array, list, SparseVector, or SciPy sparse) and a target - NumPy array that is either 1- or 2-dimensional. Equivalent to - calling numpy.dot of the two vectors, but for SciPy ones, we - have to transpose them because they're column vectors. - """ - if type(vec) == ndarray: - return numpy.dot(vec, target) - elif type(vec) == SparseVector: - return vec.dot(target) - elif type(vec) == list: - return numpy.dot(_convert_vector(vec), target) - else: - return vec.transpose().dot(target)[0] - - -def _test(): - import doctest - globs = globals().copy() - globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() - if failure_count: - exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/python/pyspark/mllib/util.py.orig b/python/pyspark/mllib/util.py.orig deleted file mode 100644 index 8f053aae42e88..0000000000000 --- a/python/pyspark/mllib/util.py.orig +++ /dev/null @@ -1,211 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numpy as np -import warnings - -from pyspark.mllib.linalg import Vectors, SparseVector -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib._common import _convert_vector, _deserialize_labeled_point -from pyspark.rdd import RDD -from pyspark.serializers import NoOpSerializer - - -class MLUtils: - - """ - Helper methods to load, save and pre-process data used in MLlib. 
- """ - - @staticmethod - def _parse_libsvm_line(line, multiclass): - warnings.warn("deprecated", DeprecationWarning) - return _parse_libsvm_line(line) - - @staticmethod - def _parse_libsvm_line(line): - """ - Parses a line in LIBSVM format into (label, indices, values). - """ - items = line.split(None) - label = float(items[0]) - nnz = len(items) - 1 - indices = np.zeros(nnz, dtype=np.int32) - values = np.zeros(nnz) - for i in xrange(nnz): - index, value = items[1 + i].split(":") - indices[i] = int(index) - 1 - values[i] = float(value) - return label, indices, values - - @staticmethod - def _convert_labeled_point_to_libsvm(p): - """Converts a LabeledPoint to a string in LIBSVM format.""" - items = [str(p.label)] - v = _convert_vector(p.features) - if type(v) == np.ndarray: - for i in xrange(len(v)): - items.append(str(i + 1) + ":" + str(v[i])) - elif type(v) == SparseVector: - nnz = len(v.indices) - for i in xrange(nnz): - items.append(str(v.indices[i] + 1) + ":" + str(v.values[i])) - else: - raise TypeError("_convert_labeled_point_to_libsvm needs either ndarray or SparseVector" - " but got " % type(v)) - return " ".join(items) - - @staticmethod - def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None): - warnings.warn("deprecated", DeprecationWarning) - return loadLibSVMFile(sc, path, numFeatures, minPartitions) - - @staticmethod - def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): - """ - Loads labeled data in the LIBSVM format into an RDD of - LabeledPoint. The LIBSVM format is a text-based format used by - LIBSVM and LIBLINEAR. Each line represents a labeled sparse - feature vector using the following format: - - label index1:value1 index2:value2 ... - - where the indices are one-based and in ascending order. This - method parses each line into a LabeledPoint, where the feature - indices are converted to zero-based. - - @param sc: Spark context - @param path: file or directory path in any Hadoop-supported file - system URI - @param numFeatures: number of features, which will be determined - from the input data if a nonpositive value - is given. This is useful when the dataset is - already split into multiple files and you - want to load them separately, because some - features may not present in certain files, - which leads to inconsistent feature - dimensions. 
- @param minPartitions: min number of partitions - @return: labeled data stored as an RDD of LabeledPoint - - >>> from tempfile import NamedTemporaryFile - >>> from pyspark.mllib.util import MLUtils - >>> tempFile = NamedTemporaryFile(delete=True) - >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0") - >>> tempFile.flush() - >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() - >>> tempFile.close() - >>> type(examples[0]) == LabeledPoint - True - >>> print examples[0] - (1.0,(6,[0,2,4],[1.0,2.0,3.0])) - >>> type(examples[1]) == LabeledPoint - True - >>> print examples[1] - (-1.0,(6,[],[])) - >>> type(examples[2]) == LabeledPoint - True - >>> print examples[2] - (-1.0,(6,[1,3,5],[4.0,5.0,6.0])) - """ - - lines = sc.textFile(path, minPartitions) - parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) - if numFeatures <= 0: - parsed.cache() -<<<<<<< HEAD - numFeatures = parsed.map( - lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 -======= - numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 ->>>>>>> 3dc55fdf450b4237f7c592fce56d1467fd206366 - return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) - - @staticmethod - def saveAsLibSVMFile(data, dir): - """ - Save labeled data in LIBSVM format. - - @param data: an RDD of LabeledPoint to be saved - @param dir: directory to save the data - - >>> from tempfile import NamedTemporaryFile - >>> from fileinput import input - >>> from glob import glob - >>> from pyspark.mllib.util import MLUtils - >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \ - LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))] - >>> tempFile = NamedTemporaryFile(delete=True) - >>> tempFile.close() - >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name) - >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*")))) - '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n' - """ - lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p)) - lines.saveAsTextFile(dir) - - @staticmethod - def loadLabeledPoints(sc, path, minPartitions=None): - """ - Load labeled points saved using RDD.saveAsTextFile. 
- - @param sc: Spark context - @param path: file or directory path in any Hadoop-supported file - system URI - @param minPartitions: min number of partitions - @return: labeled data stored as an RDD of LabeledPoint - - >>> from tempfile import NamedTemporaryFile - >>> from pyspark.mllib.util import MLUtils - >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \ - LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))] - >>> tempFile = NamedTemporaryFile(delete=True) - >>> tempFile.close() - >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name) - >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect() - >>> type(loaded[0]) == LabeledPoint - True - >>> print examples[0] - (1.1,(3,[0,2],[-1.23,4.56e-07])) - >>> type(examples[1]) == LabeledPoint - True - >>> print examples[1] - (0.0,[1.01,2.02,3.03]) - """ - minPartitions = minPartitions or min(sc.defaultParallelism, 2) - jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints( - sc._jsc, path, minPartitions) - serialized = RDD(jSerialized, sc, NoOpSerializer()) - return serialized.map(lambda bytes: _deserialize_labeled_point(bytearray(bytes))) - - -def _test(): - import doctest - from pyspark.context import SparkContext - globs = globals().copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() - if failure_count: - exit(-1) - - -if __name__ == "__main__": - _test() From 6f4900b64ac3f6500748f5d0f849e20567976879 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 03:17:24 -0400 Subject: [PATCH 10/23] [SPARK-2627] more misc PEP 8 fixes --- python/pyspark/mllib/tests.py | 14 ++++++++------ python/pyspark/mllib/tree.py | 7 +++++-- python/pyspark/mllib/util.py | 3 ++- python/pyspark/sql.py | 8 +++++--- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 24843d3c4eaf2..a427f7c820200 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -136,7 +136,7 @@ def test_classification(self): self.assertTrue(nb_model.predict(features[2]) <= 0) self.assertTrue(nb_model.predict(features[3]) > 0) - categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories + categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories dt_model = \ DecisionTree.trainClassifier(rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo) @@ -176,9 +176,10 @@ def test_regression(self): self.assertTrue(rr_model.predict(features[2]) <= 0) self.assertTrue(rr_model.predict(features[3]) > 0) - categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories + categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories dt_model = \ - DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) + DecisionTree.trainRegressor( + rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) @@ -290,7 +291,7 @@ def test_classification(self): self.assertTrue(nb_model.predict(features[2]) <= 0) self.assertTrue(nb_model.predict(features[3]) > 0) - categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories + categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories dt_model = 
DecisionTree.trainClassifier(rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) @@ -329,8 +330,9 @@ def test_regression(self): self.assertTrue(rr_model.predict(features[2]) <= 0) self.assertTrue(rr_model.predict(features[3]) > 0) - categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories - dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) + categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories + dt_model = DecisionTree.trainRegressor( + rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 1e0006df75ac6..5713fa7be0297 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -25,7 +25,9 @@ from pyspark.mllib.regression import LabeledPoint from pyspark.serializers import NoOpSerializer + class DecisionTreeModel(object): + """ A decision tree model for classification or regression. @@ -77,6 +79,7 @@ def __str__(self): class DecisionTree(object): + """ Learning algorithm for a decision tree model for classification or regression. @@ -174,7 +177,6 @@ def trainRegressor(data, categoricalFeaturesInfo={}, categoricalFeaturesInfo, impurity, maxDepth, maxBins) - @staticmethod def train(data, algo, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins=100): @@ -216,7 +218,8 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index 9ecceaead346f..75754286cd622 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -127,7 +127,8 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) if numFeatures <= 0: parsed.cache() - numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 + numFeatures = parsed.map( + lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) @staticmethod diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index cabd4fa71222d..6fbdfb171b921 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -981,9 +981,10 @@ def registerFunction(self, name, f, returnType=StringType()): env = MapConverter().convert(self._sc.environment, self._sc._gateway._gateway_client) includes = ListConverter().convert(self._sc._python_includes, - self._sc._gateway._gateway_client) + self._sc._gateway._gateway_client) self._ssql_ctx.registerPython(name, - bytearray(CloudPickleSerializer().dumps(command)), + bytearray( + CloudPickleSerializer().dumps(command)), env, includes, self._sc.pythonExec, @@ -1525,7 +1526,8 @@ def registerTempTable(self, name): self._jschema_rdd.registerTempTable(name) def registerAsTable(self, name): - warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning) + warnings.warn( + "Use registerTempTable instead of registerAsTable.", DeprecationWarning) 
self.registerTempTable(name) def insertInto(self, tableName, overwrite=False): From 21da538b432a165c2c4ba6eb60ff74b7ed5ab5b4 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 03:25:45 -0400 Subject: [PATCH 11/23] [SPARK-2627] it's PEP 8, not PEP8 Minor, I know. --- dev/lint-python | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/lint-python b/dev/lint-python index 3105e64d33d4b..50e52460be1e7 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -32,10 +32,10 @@ pep8_status=${PIPESTATUS[0]} #$? if [ $pep8_status -ne 0 ] then - echo "PEP8 checks failed." + echo "PEP 8 checks failed." cat "$PEP8_REPORT_PATH" else - echo "PEP8 checks passed." + echo "PEP 8 checks passed." fi rm -f "$PEP8_REPORT_PATH" From a1ce7ae405754600cf92828bf0db3271c46e4961 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 13:47:50 -0400 Subject: [PATCH 12/23] [SPARK-2627] space out test report sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Everything looks cramped and it’s hard to tell at a glance where sections begin. Adding a blank line between sections should fix that. --- dev/run-tests | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dev/run-tests b/dev/run-tests index 32c3f90fa41c8..0e24515d1376c 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -66,21 +66,25 @@ fi set -e set -o pipefail +echo "" echo "=========================================================================" echo "Running Apache RAT checks" echo "=========================================================================" dev/check-license +echo "" echo "=========================================================================" echo "Running Scala style checks" echo "=========================================================================" dev/lint-scala +echo "" echo "=========================================================================" echo "Running Python style checks" echo "=========================================================================" dev/lint-python +echo "" echo "=========================================================================" echo "Running Spark unit tests" echo "=========================================================================" @@ -94,11 +98,13 @@ fi echo -e "q\n" | sbt/sbt $SBT_MAVEN_PROFILES_ARGS clean package assembly/assembly test | \ grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" +echo "" echo "=========================================================================" echo "Running PySpark tests" echo "=========================================================================" ./python/run-tests +echo "" echo "=========================================================================" echo "Detecting binary incompatibilites with MiMa" echo "=========================================================================" From dffb5ddc465b9e0b1220b5713c17ff4aaf9b7dbc Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 14:11:57 -0400 Subject: [PATCH 13/23] [SPARK-2627] download pep8 at runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See the discussion here: https://github.com/apache/spark/pull/1744#issuecomment-50982162 Get the pep8 utility at runtime so that it’s not required to be installed on the build server. 
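(A minimal sketch of the approach this patch takes, under the assumption that curl and a system python are on the PATH. The URL and the 1.5.7 pin match the ones the patch introduces; the local path and exit handling here are illustrative, not the actual dev/lint-python.)

    #!/usr/bin/env bash
    # Sketch of the download-at-runtime idea: fetch a pinned release of
    # pep8.py, run it through the interpreter, and clean up afterwards.
    PEP8_SCRIPT_PATH="./pep8.py"
    PEP8_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/jcrocholl/pep8/1.5.7/pep8.py"

    # Pinning to a known release keeps the check reproducible across builds.
    curl --silent -o "$PEP8_SCRIPT_PATH" "$PEP8_SCRIPT_REMOTE_PATH"

    # Run the style check against the python/ tree, then remove the script.
    python "$PEP8_SCRIPT_PATH" ./python
    lint_status=$?
    rm -f "$PEP8_SCRIPT_PATH"
    exit $lint_status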
--- dev/lint-python | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/dev/lint-python b/dev/lint-python index 50e52460be1e7..2d67084dd9587 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -22,11 +22,21 @@ SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" cd $SPARK_ROOT_DIR + +# See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 +# Get pep8 at runtime so that we don't rely on it being installed on the build server. +# TODOs: +# - Dynamically determine latest release version of pep8 and use that. +# - Download this from a more reliable source. (GitHub raw can be flaky, apparently. (?)) +PEP8_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pep8.py" +curl --silent -o "$PEP8_SCRIPT_PATH" \ + "https://raw.githubusercontent.com/jcrocholl/pep8/1.5.7/pep8.py" + # There is no need to write this output to a file #+ first, but we do so so that the check status can #+ be output before the report, like with the #+ scalastyle and RAT checks. -pep8 ./python --exclude="cloudpickle.py" \ +python $PEP8_SCRIPT_PATH ./python --exclude="cloudpickle.py" \ > "$PEP8_REPORT_PATH" pep8_status=${PIPESTATUS[0]} #$? @@ -39,4 +49,6 @@ if [ $pep8_status -ne 0 ] fi rm -f "$PEP8_REPORT_PATH" +rm "$PEP8_SCRIPT_PATH" + exit $pep8_status From d0a83b91c0b192451f0289e62f3448df54c48562 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Sun, 3 Aug 2014 14:44:40 -0400 Subject: [PATCH 14/23] [SPARK-2627] check that pep8 downloaded fine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don’t just assume curl got the file alright. Check and exit properly if there were any problems. --- dev/lint-python | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dev/lint-python b/dev/lint-python index 2d67084dd9587..f8ca3072b8f89 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -23,14 +23,23 @@ PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" cd $SPARK_ROOT_DIR -# See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 # Get pep8 at runtime so that we don't rely on it being installed on the build server. -# TODOs: -# - Dynamically determine latest release version of pep8 and use that. -# - Download this from a more reliable source. (GitHub raw can be flaky, apparently. (?)) +#+ See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 +#+ TODOs: +#+ - Dynamically determine latest release version of pep8 and use that. +#+ - Download this from a more reliable source. (GitHub raw can be flaky, apparently. (?)) PEP8_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pep8.py" -curl --silent -o "$PEP8_SCRIPT_PATH" \ - "https://raw.githubusercontent.com/jcrocholl/pep8/1.5.7/pep8.py" +PEP8_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/jcrocholl/pep8/1.5.7/pep8.py" + +curl --silent -o "$PEP8_SCRIPT_PATH" "$PEP8_SCRIPT_REMOTE_PATH" +curl_status=$? + +if [ $curl_status -ne 0 ] + then + echo "Failed to download pep8.py from \"$PEP8_SCRIPT_REMOTE_PATH\"." + exit $curl_status +fi + # There is no need to write this output to a file #+ first, but we do so so that the check status can From aa5b4b50047573123db9f5b5c47989330a268846 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 4 Aug 2014 16:57:42 -0400 Subject: [PATCH 15/23] [SPARK-2627] follow Spark bash style for if blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit “then” goes on the same line as the opening “if”. 
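(For reference, a minimal before/after sketch of the convention this patch applies; the status variable is made up for illustration, only the shape matters.)

    some_status=1  # hypothetical status code for illustration

    # Not Spark style: "then" on its own line.
    if [ $some_status -ne 0 ]
      then
        echo "check failed."
    fi

    # Spark style: "then" on the same line as the opening "if".
    if [ $some_status -ne 0 ]; then
      echo "check failed."
    fi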
--- dev/lint-python | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/dev/lint-python b/dev/lint-python index f8ca3072b8f89..9913313ab349c 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -34,10 +34,9 @@ PEP8_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/jcrocholl/pep8/1.5.7/ curl --silent -o "$PEP8_SCRIPT_PATH" "$PEP8_SCRIPT_REMOTE_PATH" curl_status=$? -if [ $curl_status -ne 0 ] - then - echo "Failed to download pep8.py from \"$PEP8_SCRIPT_REMOTE_PATH\"." - exit $curl_status +if [ $curl_status -ne 0 ]; then + echo "Failed to download pep8.py from \"$PEP8_SCRIPT_REMOTE_PATH\"." + exit $curl_status fi @@ -49,12 +48,11 @@ python $PEP8_SCRIPT_PATH ./python --exclude="cloudpickle.py" \ > "$PEP8_REPORT_PATH" pep8_status=${PIPESTATUS[0]} #$? -if [ $pep8_status -ne 0 ] - then - echo "PEP 8 checks failed." - cat "$PEP8_REPORT_PATH" - else - echo "PEP 8 checks passed." +if [ $pep8_status -ne 0 ]; then + echo "PEP 8 checks failed." + cat "$PEP8_REPORT_PATH" +else + echo "PEP 8 checks passed." fi rm -f "$PEP8_REPORT_PATH" From bfb9f9f12fa04910ebc5f53e6a5538d8341139b8 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 4 Aug 2014 17:20:14 -0400 Subject: [PATCH 16/23] [SPARK-2627] keep up with the PEP 8 fixes --- python/pyspark/daemon.py | 3 +-- python/pyspark/serializers.py | 11 ++++++----- python/pyspark/tests.py | 9 ++++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/python/pyspark/daemon.py b/python/pyspark/daemon.py index 03771de91afe6..b1b7f14da959b 100644 --- a/python/pyspark/daemon.py +++ b/python/pyspark/daemon.py @@ -138,8 +138,7 @@ def handle_sigchld(*args): try: os.kill(worker_pid, signal.SIGKILL) except OSError: - pass # process already died - + pass # process already died if listen_sock in ready_fds: sock, addr = listen_sock.accept() diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index d1628d87644ba..5b6fa2c609537 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -293,6 +293,7 @@ def _hack_namedtuple(cls): """ Make class generated by namedtuple picklable """ name = cls.__name__ fields = cls._fields + def __reduce__(self): return (_restore, (name, fields, tuple(self))) cls.__reduce__ = __reduce__ @@ -301,11 +302,11 @@ def __reduce__(self): def _hijack_namedtuple(): """ Hack namedtuple() to make it picklable """ - global _old_namedtuple # or it will put in closure + global _old_namedtuple # or it will put in closure def _copy_func(f): return types.FunctionType(f.func_code, f.func_globals, f.func_name, - f.func_defaults, f.func_closure) + f.func_defaults, f.func_closure) _old_namedtuple = _copy_func(collections.namedtuple) @@ -323,9 +324,9 @@ def namedtuple(name, fields, verbose=False, rename=False): # so only hack those in __main__ module for n, o in sys.modules["__main__"].__dict__.iteritems(): if (type(o) is type and o.__base__ is tuple - and hasattr(o, "_fields") - and "__reduce__" not in o.__dict__): - _hack_namedtuple(o) # hack inplace + and hasattr(o, "_fields") + and "__reduce__" not in o.__dict__): + _hack_namedtuple(o) # hack inplace _hijack_namedtuple() diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 69eeaaa830b84..ec2553ae46649 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -840,12 +840,15 @@ def test_termination_sigterm(self): class TestWorker(PySparkTestCase): + def test_cancel_task(self): temp = tempfile.NamedTemporaryFile(delete=True) temp.close() path = temp.name + def sleep(x): - import os, time 
+ import os + import time with open(path, 'w') as f: f.write("%d %d" % (os.getppid(), os.getpid())) time.sleep(100) @@ -875,7 +878,7 @@ def run(): os.kill(worker_pid, 0) time.sleep(0.1) except OSError: - break # worker was killed + break # worker was killed else: self.fail("worker has not been killed after 5 seconds") @@ -885,7 +888,7 @@ def run(): self.fail("daemon had been killed") def test_fd_leak(self): - N = 1100 # fd limit is 1024 by default + N = 1100 # fd limit is 1024 by default rdd = self.sc.parallelize(range(N), N) self.assertEquals(N, rdd.count()) From b09fae2af51439375c56776b567ef561c44a31d1 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 4 Aug 2014 17:32:18 -0400 Subject: [PATCH 17/23] don't wrap comments unnecessarily --- python/pyspark/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index ac55fcdc2057b..c58555fc9d2c5 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -53,8 +53,7 @@ # mllib that depend on top level pyspark packages, which transitively depend on python's random. # Since Python's import logic looks for modules in the current package first, we eliminate # mllib.random as a candidate for C{import random} by removing the first search path, the script's -# location, in order to force the loader to look in Python's top-level -# modules for C{random}. +# location, in order to force the loader to look in Python's top-level modules for C{random}. import sys s = sys.path.pop(0) import random From 44e3e56e14ef9d4edd2849623577c1e9282f81ed Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 4 Aug 2014 19:23:55 -0400 Subject: [PATCH 18/23] [SPARK-2627] use tox.ini to exclude files No need to exclude files in the call to pep8. The tool references tox.ini for config information, so we should use that. --- dev/lint-python | 3 +-- tox.ini | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/lint-python b/dev/lint-python index 9913313ab349c..4efddad839387 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -44,8 +44,7 @@ fi #+ first, but we do so so that the check status can #+ be output before the report, like with the #+ scalastyle and RAT checks. -python $PEP8_SCRIPT_PATH ./python --exclude="cloudpickle.py" \ - > "$PEP8_REPORT_PATH" +python $PEP8_SCRIPT_PATH ./python > "$PEP8_REPORT_PATH" pep8_status=${PIPESTATUS[0]} #$? if [ $pep8_status -ne 0 ]; then diff --git a/tox.ini b/tox.ini index 44766e529bf7f..a1fefdd0e176f 100644 --- a/tox.ini +++ b/tox.ini @@ -15,3 +15,4 @@ [pep8] max-line-length=100 +exclude=cloudpickle.py From 91b7584a5c43700bd559ed29868af4fba97f26ea Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 4 Aug 2014 19:25:11 -0400 Subject: [PATCH 19/23] [SPARK-2627] undo unnecessary line breaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit “Undo” unnecessary line breaks introduced by accident when I called autopep8 on the whole Python directory without setting the max line length to 100. (autopep8 defaults to 79.) 
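(An aside on avoiding this in the first place: autopep8 accepts a --max-line-length flag, so a run like the sketch below would respect the 100-character limit that tox.ini declares instead of wrapping lines that are already legal. The invocation is illustrative, not taken from this patch series.)

    # Illustrative only: reformat the python/ tree in place using the
    # 100-character limit from tox.ini rather than autopep8's default of 79.
    autopep8 --in-place --recursive --max-line-length=100 ./python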
--- python/pyspark/accumulators.py | 12 +-- python/pyspark/conf.py | 3 +- python/pyspark/context.py | 104 ++++++++++--------------- python/pyspark/daemon.py | 12 +-- python/pyspark/java_gateway.py | 6 +- python/pyspark/mllib/_common.py | 30 +++---- python/pyspark/mllib/classification.py | 6 +- python/pyspark/mllib/clustering.py | 3 +- python/pyspark/mllib/linalg.py | 15 ++-- python/pyspark/mllib/random.py | 12 +-- python/pyspark/mllib/recommendation.py | 3 +- python/pyspark/mllib/regression.py | 6 +- python/pyspark/mllib/stat.py | 9 +-- python/pyspark/mllib/tests.py | 39 ++++------ python/pyspark/mllib/tree.py | 3 +- python/pyspark/mllib/util.py | 9 +-- python/pyspark/rdd.py | 16 ++-- python/pyspark/rddsampler.py | 6 +- python/pyspark/shell.py | 3 +- python/pyspark/sql.py | 6 +- python/pyspark/statcounter.py | 12 +-- python/pyspark/tests.py | 79 +++++++------------ python/pyspark/worker.py | 3 +- 23 files changed, 143 insertions(+), 254 deletions(-) diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index f85e2501f9b03..f133cf6f7befc 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -97,8 +97,7 @@ pickleSer = PickleSerializer() # Holds accumulators registered on the current machine, keyed by ID. This is then used to send -# the local accumulator updates back to the driver program at the end of a -# task. +# the local accumulator updates back to the driver program at the end of a task. _accumulatorRegistry = {} @@ -141,16 +140,14 @@ def __reduce__(self): def value(self): """Get the accumulator's value; only usable in driver program""" if self._deserialized: - raise Exception( - "Accumulator.value cannot be accessed inside tasks") + raise Exception("Accumulator.value cannot be accessed inside tasks") return self._value @value.setter def value(self, value): """Sets the accumulator's value; only usable in driver program""" if self._deserialized: - raise Exception( - "Accumulator.value cannot be accessed inside tasks") + raise Exception("Accumulator.value cannot be accessed inside tasks") self._value = value def add(self, term): @@ -225,8 +222,7 @@ class _UpdateRequestHandler(SocketServer.StreamRequestHandler): def handle(self): from pyspark.accumulators import _accumulatorRegistry while not self.server.server_shutdown: - # Poll every 1 second for new data -- don't block in case of - # shutdown. + # Poll every 1 second for new data -- don't block in case of shutdown. 
r, _, _ = select.select([self.rfile], [], [], 1) if self.rfile in r: num_updates = read_int(self.rfile) diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index ebfc0ad003b3b..fb716f6753a45 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -125,8 +125,7 @@ def setSparkHome(self, value): def setExecutorEnv(self, key=None, value=None, pairs=None): """Set an environment variable to be passed to executors.""" if (key is not None and pairs is not None) or (key is None and pairs is None): - raise Exception( - "Either pass one key-value pair or a list of pairs") + raise Exception("Either pass one key-value pair or a list of pairs") elif key is not None: self._jconf.setExecutorEnv(key, value) elif pairs is not None: diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 3ecd9a58f5b74..840925d3e31e9 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -59,8 +59,7 @@ class SparkContext(object): _writeToFile = None _next_accum_id = 0 _active_spark_context = None - _lock = Lock() - # zip and egg files that need to be added to PYTHONPATH + _lock = Lock() # zip and egg files that need to be added to PYTHONPATH _python_includes = None _default_batch_size_for_serialized_input = 10 @@ -101,15 +100,13 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, self._callsite = rdd._extract_concise_traceback() else: tempNamedTuple = namedtuple("Callsite", "function file linenum") - self._callsite = tempNamedTuple( - function=None, file=None, linenum=None) + self._callsite = tempNamedTuple(function=None, file=None, linenum=None) SparkContext._ensure_initialized(self, gateway=gateway) try: self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf) except: - # If an error occurs, clean up in order to allow future - # SparkContext creation: + # If an error occurs, clean up in order to allow future SparkContext creation: self.stop() raise @@ -142,8 +139,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, if not self._conf.contains("spark.master"): raise Exception("A master URL must be set in your configuration") if not self._conf.contains("spark.app.name"): - raise Exception( - "An application name must be set in your configuration") + raise Exception("An application name must be set in your configuration") # Read back our properties from the conf in case we loaded some of them from # the classpath or an external config file @@ -184,8 +180,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, self.addPyFile(path) # Deploy code dependencies set by spark-submit; these will already have been added - # with SparkContext.addFile, so we just need to add them to the - # PYTHONPATH + # with SparkContext.addFile, so we just need to add them to the PYTHONPATH for path in self._conf.get("spark.submit.pyFiles", "").split(","): if path != "": (dirname, filename) = os.path.split(path) @@ -195,11 +190,9 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, sys.path.append(dirname) # Create a temporary directory inside spark.local.dir: - local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir( - self._jsc.sc().conf()) + local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf()) self._temp_dir = \ - self._jvm.org.apache.spark.util.Utils.createTempDir( - local_dir).getAbsolutePath() + self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath() def 
_initialize_context(self, jconf): """ @@ -292,8 +285,7 @@ def parallelize(self, c, numSlices=None): # because it sends O(n) Py4J commands. As an alternative, serialized # objects are written to a file and loaded through textFile(). tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) - # Make sure we distribute data evenly if it's smaller than - # self.batchSize + # Make sure we distribute data evenly if it's smaller than self.batchSize if "__len__" not in dir(c): c = list(c) # Make it a list so we can compute its length batchSize = min(len(c) // numSlices, self._batchSize) @@ -412,10 +404,8 @@ def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter=None, Java object. (default sc._default_batch_size_for_serialized_input) """ minSplits = minSplits or min(self.defaultParallelism, 2) - batchSize = max( - 1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if ( - batchSize > 1) else PickleSerializer() + batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, path, keyClass, valueClass, keyConverter, valueConverter, minSplits, batchSize) return RDD(jrdd, self, ser) @@ -445,13 +435,11 @@ def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConv Java object. (default sc._default_batch_size_for_serialized_input) """ jconf = self._dictToJavaMap(conf) - batchSize = max( - 1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if ( - batchSize > 1) else PickleSerializer() - jrdd = self._jvm.PythonRDD.newAPIHadoopFile( - self._jsc, path, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, jconf, batchSize) + batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() + jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass, + valueClass, keyConverter, valueConverter, + jconf, batchSize) return RDD(jrdd, self, ser) def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None, @@ -476,13 +464,11 @@ def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=N Java object. (default sc._default_batch_size_for_serialized_input) """ jconf = self._dictToJavaMap(conf) - batchSize = max( - 1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if ( - batchSize > 1) else PickleSerializer() - jrdd = self._jvm.PythonRDD.newAPIHadoopRDD( - self._jsc, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, jconf, batchSize) + batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() + jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass, + valueClass, keyConverter, valueConverter, + jconf, batchSize) return RDD(jrdd, self, ser) def hadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None, @@ -510,13 +496,11 @@ def hadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter= Java object. 
(default sc._default_batch_size_for_serialized_input) """ jconf = self._dictToJavaMap(conf) - batchSize = max( - 1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if ( - batchSize > 1) else PickleSerializer() - jrdd = self._jvm.PythonRDD.hadoopFile( - self._jsc, path, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, jconf, batchSize) + batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() + jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, path, inputFormatClass, keyClass, + valueClass, keyConverter, valueConverter, + jconf, batchSize) return RDD(jrdd, self, ser) def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None, @@ -541,12 +525,11 @@ def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None, Java object. (default sc._default_batch_size_for_serialized_input) """ jconf = self._dictToJavaMap(conf) - batchSize = max( - 1, batchSize or self._default_batch_size_for_serialized_input) - ser = BatchedSerializer(PickleSerializer()) if ( - batchSize > 1) else PickleSerializer() - jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass, valueClass, - keyConverter, valueConverter, jconf, batchSize) + batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input) + ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer() + jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass, + valueClass, keyConverter, valueConverter, + jconf, batchSize) return RDD(jrdd, self, ser) def _checkpointFile(self, name, input_deserializer): @@ -577,8 +560,7 @@ def union(self, rdds): first = rdds[0]._jrdd rest = [x._jrdd for x in rdds[1:]] rest = ListConverter().convert(rest, self._gateway._gateway_client) - return RDD(self._jsc.union(first, rest), self, - rdds[0]._jrdd_deserializer) + return RDD(self._jsc.union(first, rest), self, rdds[0]._jrdd_deserializer) def broadcast(self, value): """ @@ -590,8 +572,7 @@ def broadcast(self, value): pickleSer = PickleSerializer() pickled = pickleSer.dumps(value) jbroadcast = self._jsc.broadcast(bytearray(pickled)) - return Broadcast(jbroadcast.id(), value, jbroadcast, - self._pickled_broadcast_vars) + return Broadcast(jbroadcast.id(), value, jbroadcast, self._pickled_broadcast_vars) def accumulator(self, value, accum_param=None): """ @@ -609,8 +590,7 @@ def accumulator(self, value, accum_param=None): elif isinstance(value, complex): accum_param = accumulators.COMPLEX_ACCUMULATOR_PARAM else: - raise Exception( - "No default accumulator param for type %s" % type(value)) + raise Exception("No default accumulator param for type %s" % type(value)) SparkContext._next_accum_id += 1 return Accumulator(SparkContext._next_accum_id - 1, value, accum_param) @@ -655,14 +635,12 @@ def addPyFile(self, path): HTTP, HTTPS or FTP URI. 
""" self.addFile(path) - # dirname may be directory or HDFS/S3 prefix - (dirname, filename) = os.path.split(path) + (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix if filename.endswith('.zip') or filename.endswith('.ZIP') or filename.endswith('.egg'): self._python_includes.append(filename) # for tests in local mode - sys.path.append( - os.path.join(SparkFiles.getRootDirectory(), filename)) + sys.path.append(os.path.join(SparkFiles.getRootDirectory(), filename)) def setCheckpointDir(self, dirName): """ @@ -676,8 +654,7 @@ def _getJavaStorageLevel(self, storageLevel): Returns a Java StorageLevel based on a pyspark.StorageLevel. """ if not isinstance(storageLevel, StorageLevel): - raise Exception( - "storageLevel must be of type pyspark.StorageLevel") + raise Exception("storageLevel must be of type pyspark.StorageLevel") newStorageLevel = self._jvm.org.apache.spark.storage.StorageLevel return newStorageLevel(storageLevel.useDisk, @@ -780,15 +757,13 @@ def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False): """ if partitions is None: partitions = range(rdd._jrdd.partitions().size()) - javaPartitions = ListConverter().convert( - partitions, self._gateway._gateway_client) + javaPartitions = ListConverter().convert(partitions, self._gateway._gateway_client) # Implementation note: This is implemented as a mapPartitions followed # by runJob() in order to avoid having to pass a Python lambda into # SparkContext#runJob. mappedRDD = rdd.mapPartitions(partitionFunc) - it = self._jvm.PythonRDD.runJob( - self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal) + it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal) return list(mappedRDD._collect_iterator_through_file(it)) @@ -800,8 +775,7 @@ def _test(): globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) globs['tempdir'] = tempfile.mkdtemp() atexit.register(lambda: shutil.rmtree(globs['tempdir'])) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/daemon.py b/python/pyspark/daemon.py index b1b7f14da959b..e73538baf0b93 100644 --- a/python/pyspark/daemon.py +++ b/python/pyspark/daemon.py @@ -30,8 +30,7 @@ def compute_real_exit_code(exit_code): - # SystemExit's code can be integer or string, but os._exit only accepts - # integers + # SystemExit's code can be integer or string, but os._exit only accepts integers if isinstance(exit_code, numbers.Integral): return exit_code else: @@ -44,8 +43,7 @@ def worker(sock): """ # Redirect stdout to stderr os.dup2(2, 1) - # The sys.stdout object is different from file descriptor 1 - sys.stdout = sys.stderr + sys.stdout = sys.stderr # The sys.stdout object is different from file descriptor 1 signal.signal(SIGHUP, SIG_DFL) signal.signal(SIGCHLD, SIG_DFL) @@ -64,8 +62,7 @@ def waitSocketClose(sock): # Read the socket using fdopen instead of socket.makefile() because the latter # seems to be very slow; note that we need to dup() the file descriptor because - # otherwise writes also cause a seek that makes us miss data on the read - # side. + # otherwise writes also cause a seek that makes us miss data on the read side. 
infile = os.fdopen(os.dup(sock.fileno()), "a+", 65536) outfile = os.fdopen(os.dup(sock.fileno()), "a+", 65536) exit_code = 0 @@ -110,8 +107,7 @@ def handle_sigchld(*args): try: pid, status = os.waitpid(0, os.WNOHANG) if status != 0: - msg = "worker %s crashed abruptly with exit status %s" % ( - pid, status) + msg = "worker %s crashed abruptly with exit status %s" % (pid, status) print >> sys.stderr, msg except EnvironmentError as err: if err.errno not in (ECHILD, EINTR): diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 2be2282f0c80b..37386ab0d7d49 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -39,14 +39,12 @@ def launch_gateway(): submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS") submit_args = submit_args if submit_args is not None else "" submit_args = shlex.split(submit_args) - command = [ - os.path.join(SPARK_HOME, script), "pyspark-shell"] + submit_args + command = [os.path.join(SPARK_HOME, script), "pyspark-shell"] + submit_args if not on_windows: # Don't send ctrl-c / SIGINT to the Java gateway: def preexec_func(): signal.signal(signal.SIGINT, signal.SIG_IGN) - proc = Popen( - command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func) + proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func) else: # preexec_fn not supported on Windows proc = Popen(command, stdout=PIPE, stdin=PIPE) diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py index 1503c567f13ea..db341da85f865 100644 --- a/python/pyspark/mllib/_common.py +++ b/python/pyspark/mllib/_common.py @@ -35,8 +35,7 @@ # Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, -# such as _dot and _serialize_double_vector, start to support scipy.sparse -# matrices. +# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. _have_scipy = False _scipy_issparse = None @@ -165,8 +164,7 @@ def _serialize_sparse_vector(v): header[1] = nonzeros _copyto(v.indices, buffer=ba, offset=9, shape=[nonzeros], dtype=int32) values_offset = 9 + 4 * nonzeros - _copyto(v.values, buffer=ba, offset=values_offset, - shape=[nonzeros], dtype=float64) + _copyto(v.values, buffer=ba, offset=values_offset, shape=[nonzeros], dtype=float64) return ba @@ -190,11 +188,9 @@ def _deserialize_double(ba, offset=0): True """ if type(ba) != bytearray: - raise TypeError( - "_deserialize_double called on a %s; wanted bytearray" % type(ba)) + raise TypeError("_deserialize_double called on a %s; wanted bytearray" % type(ba)) if len(ba) - offset != 8: - raise TypeError( - "_deserialize_double called on a %d-byte array; wanted 8 bytes." % nb) + raise TypeError("_deserialize_double called on a %d-byte array; wanted 8 bytes." 
% nb) return struct.unpack("d", ba[offset:])[0] @@ -250,8 +246,7 @@ def _deserialize_sparse_vector(ba, offset=0): raise TypeError("_deserialize_sparse_vector called on bytearray " "with wrong length") indices = _deserialize_numpy_array([nonzeros], ba, offset + 9, dtype=int32) - values = _deserialize_numpy_array( - [nonzeros], ba, offset + 9 + 4 * nonzeros, dtype=float64) + values = _deserialize_numpy_array([nonzeros], ba, offset + 9 + 4 * nonzeros, dtype=float64) return SparseVector(int(size), indices, values) @@ -330,8 +325,7 @@ def _deserialize_labeled_point(ba, offset=0): if type(ba) != bytearray: raise TypeError("Expecting a bytearray but got %s" % type(ba)) if ba[offset] != LABELED_POINT_MAGIC: - raise TypeError("Expecting magic number %d but got %d" % - (LABELED_POINT_MAGIC, ba[0])) + raise TypeError("Expecting magic number %d but got %d" % (LABELED_POINT_MAGIC, ba[0])) label = ndarray(shape=[1], buffer=ba, offset=offset + 1, dtype=float64)[0] features = _deserialize_double_vector(ba, offset + 9) return LabeledPoint(label, features) @@ -345,8 +339,7 @@ def _copyto(array, buffer, offset, shape, dtype): TODO: In the future this could use numpy.copyto on NumPy 1.7+, but we should benchmark that to see whether it provides a benefit. """ - temp_array = ndarray( - shape=shape, buffer=buffer, offset=offset, dtype=dtype, order='C') + temp_array = ndarray(shape=shape, buffer=buffer, offset=offset, dtype=dtype, order='C') temp_array[...] = array @@ -403,8 +396,7 @@ def _linear_predictor_typecheck(x, coeffs): elif isinstance(x, RDD): raise RuntimeError("Bulk predict not yet supported.") else: - raise TypeError( - "Argument of type " + type(x).__name__ + " unsupported") + raise TypeError("Argument of type " + type(x).__name__ + " unsupported") # If we weren't given initial weights, take a zero vector of the appropriate @@ -500,8 +492,7 @@ def _convert_vector(vec): assert vec.shape[1] == 1, "Expected column vector" csc = vec.tocsc() return SparseVector(vec.shape[0], csc.indices, csc.data) - raise TypeError( - "Expected NumPy array, SparseVector, or scipy.sparse matrix") + raise TypeError("Expected NumPy array, SparseVector, or scipy.sparse matrix") def _squared_distance(v1, v2): @@ -554,8 +545,7 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 2e31665105c62..a85abbcd02c79 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -190,8 +190,7 @@ def train(cls, data, lambda_=1.0): """ sc = data.context dataBytes = _get_unmangled_labeled_point_rdd(data) - ans = sc._jvm.PythonMLLibAPI().trainNaiveBayes( - dataBytes._jrdd, lambda_) + ans = sc._jvm.PythonMLLibAPI().trainNaiveBayes(dataBytes._jrdd, lambda_) return NaiveBayesModel( _deserialize_double_vector(ans[0]), _deserialize_double_vector(ans[1]), @@ -202,8 +201,7 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) 
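The hunks above keep collapsing the same wrapped doctest.testmod(...) call, because nearly every pyspark.mllib module ends with an identical self-test harness. For reference, this is the shape of that harness after the fixes, pulled out as a standalone sketch (the module globals are whatever the file under test defines; batchSize=2 is deliberate, as the comment in python/pyspark/mllib/random.py notes, so that even tiny doctest examples run through multiple serialization batches):

    import doctest

    from pyspark import SparkContext


    def _test():
        # Expose the module's own names (plus sc) to its doctests.
        globs = globals().copy()
        # The small batch size ensures the doctests exercise multiple batches.
        globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        globs['sc'].stop()
        if failure_count:
            exit(-1)


    if __name__ == "__main__":
        _test()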
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index 7e974d7186bf7..a0630d1d5c58b 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -99,8 +99,7 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 41b1f2976893b..9a239abfbbeb1 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -60,8 +60,7 @@ def __init__(self, size, *args): self.indices = array([p[0] for p in pairs], dtype=int32) self.values = array([p[1] for p in pairs], dtype=float64) else: - assert len(args[0]) == len( - args[1]), "index and value arrays not same length" + assert len(args[0]) == len(args[1]), "index and value arrays not same length" self.indices = array(args[0], dtype=int32) self.values = array(args[1], dtype=float64) for i in xrange(len(self.indices) - 1): @@ -90,12 +89,10 @@ def dot(self, other): result += self.values[i] * other[self.indices[i]] return result elif other.ndim == 2: - results = [self.dot(other[:, i]) - for i in xrange(other.shape[1])] + results = [self.dot(other[:, i]) for i in xrange(other.shape[1])] return array(results) else: - raise Exception( - "Cannot call dot with %d-dimensional array" % other.ndim) + raise Exception("Cannot call dot with %d-dimensional array" % other.ndim) else: result = 0.0 i, j = 0, 0 @@ -171,8 +168,7 @@ def __str__(self): def __repr__(self): inds = self.indices vals = self.values - entries = ", ".join( - ["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))]) + entries = ", ".join(["{0}: {1}".format(inds[i], vals[i]) for i in xrange(len(inds))]) return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): @@ -262,8 +258,7 @@ def _test(): if __name__ == "__main__": # remove current path from list of search paths to avoid importing mllib.random - # for C{import random}, which is done in an external dependency of pyspark - # during doctests. + # for C{import random}, which is done in an external dependency of pyspark during doctests. 
import sys sys.path.pop(0) _test() diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py index 1b51da913c4b3..eb496688b6eef 100644 --- a/python/pyspark/mllib/random.py +++ b/python/pyspark/mllib/random.py @@ -54,8 +54,7 @@ def uniformRDD(sc, size, numPartitions=None, seed=None): >>> parts == sc.defaultParallelism True """ - jrdd = sc._jvm.PythonMLLibAPI().uniformRDD( - sc._jsc, size, numPartitions, seed) + jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed) uniform = RDD(jrdd, sc, NoOpSerializer()) return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes))) @@ -79,8 +78,7 @@ def normalRDD(sc, size, numPartitions=None, seed=None): >>> abs(stats.stdev() - 1.0) < 0.1 True """ - jrdd = sc._jvm.PythonMLLibAPI().normalRDD( - sc._jsc, size, numPartitions, seed) + jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed) normal = RDD(jrdd, sc, NoOpSerializer()) return normal.map(lambda bytes: _deserialize_double(bytearray(bytes))) @@ -101,8 +99,7 @@ def poissonRDD(sc, mean, size, numPartitions=None, seed=None): >>> abs(stats.stdev() - sqrt(mean)) < 0.5 True """ - jrdd = sc._jvm.PythonMLLibAPI().poissonRDD( - sc._jsc, mean, size, numPartitions, seed) + jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed) poisson = RDD(jrdd, sc, NoOpSerializer()) return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes))) @@ -177,8 +174,7 @@ def _test(): # The small batch size here ensures that we see multiple batches, # even in these small test examples: globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index c398b008fda54..e863fc249ec36 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -81,8 +81,7 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index c23ec7e44ca6d..d8792cf44872f 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -44,8 +44,7 @@ def __init__(self, label, features): elif type(features) == list: self.features = array(features) else: - raise TypeError( - "Expected NumPy array, list, SparseVector, or scipy.sparse matrix") + raise TypeError("Expected NumPy array, list, SparseVector, or scipy.sparse matrix") def __str__(self): return "(" + ",".join((str(self.label), Vectors.stringify(self.features))) + ")" @@ -248,8 +247,7 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat.py index dbf2a03bb3a73..982906b9d09f0 
100644 --- a/python/pyspark/mllib/stat.py +++ b/python/pyspark/mllib/stat.py @@ -80,15 +80,13 @@ def corr(x, y=None, method=None): try: Xser = _get_unmangled_double_vector_rdd(x) except TypeError: - raise TypeError( - "corr called on a single RDD not consisted of Vectors.") + raise TypeError("corr called on a single RDD not consisted of Vectors.") resultMat = sc._jvm.PythonMLLibAPI().corr(Xser._jrdd, method) return _deserialize_double_matrix(resultMat) else: xSer = _get_unmangled_rdd(x, _serialize_double) ySer = _get_unmangled_rdd(y, _serialize_double) - result = sc._jvm.PythonMLLibAPI().corr( - xSer._jrdd, ySer._jrdd, method) + result = sc._jvm.PythonMLLibAPI().corr(xSer._jrdd, ySer._jrdd, method) return result @@ -97,8 +95,7 @@ def _test(): from pyspark import SparkContext globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index a427f7c820200..6f3ec8ac94bac 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -47,12 +47,9 @@ def test_serialize(self): self.assertTrue(sv is _convert_vector(sv)) self.assertTrue(dv is _convert_vector(dv)) self.assertTrue(array_equal(dv, _convert_vector(lst))) - self.assertEquals( - sv, _deserialize_double_vector(_serialize_double_vector(sv))) - self.assertTrue( - array_equal(dv, _deserialize_double_vector(_serialize_double_vector(dv)))) - self.assertTrue( - array_equal(dv, _deserialize_double_vector(_serialize_double_vector(lst)))) + self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(sv))) + self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(dv)))) + self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(lst)))) def test_dot(self): sv = SparseVector(4, {1: 1, 3: 2}) @@ -65,11 +62,9 @@ def test_dot(self): self.assertEquals(10.0, _dot(sv, dv)) self.assertTrue(array_equal(array([3., 6., 9., 12.]), _dot(sv, mat))) self.assertEquals(30.0, _dot(dv, dv)) - self.assertTrue( - array_equal(array([10., 20., 30., 40.]), _dot(dv, mat))) + self.assertTrue(array_equal(array([10., 20., 30., 40.]), _dot(dv, mat))) self.assertEquals(30.0, _dot(lst, dv)) - self.assertTrue( - array_equal(array([10., 20., 30., 40.]), _dot(lst, mat))) + self.assertTrue(array_equal(array([10., 20., 30., 40.]), _dot(lst, mat))) def test_squared_distance(self): sv = SparseVector(4, {1: 1, 3: 2}) @@ -101,8 +96,7 @@ def test_clustering(self): [1.1, 0], [1.2, 0], ] - clusters = KMeans.train( - self.sc.parallelize(data), 2, initializationMode="k-means||") + clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||") self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1])) self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3])) @@ -178,8 +172,7 @@ def test_regression(self): categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories dt_model = \ - DecisionTree.trainRegressor( - rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) + DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) @@ -205,14 +198,10 @@ def 
test_serialize(self): self.assertEquals(sv, _convert_vector(lil.tocoo())) self.assertEquals(sv, _convert_vector(lil.tocsr())) self.assertEquals(sv, _convert_vector(lil.todok())) - self.assertEquals( - sv, _deserialize_double_vector(_serialize_double_vector(lil))) - self.assertEquals( - sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsc()))) - self.assertEquals( - sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsr()))) - self.assertEquals( - sv, _deserialize_double_vector(_serialize_double_vector(lil.todok()))) + self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil))) + self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsc()))) + self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsr()))) + self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.todok()))) def test_dot(self): from scipy.sparse import lil_matrix @@ -256,8 +245,7 @@ def test_clustering(self): self.scipy_matrix(3, {2: 1.0}), self.scipy_matrix(3, {2: 1.1}) ] - clusters = KMeans.train( - self.sc.parallelize(data), 2, initializationMode="k-means||") + clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||") self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1])) self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3])) @@ -331,8 +319,7 @@ def test_regression(self): self.assertTrue(rr_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories - dt_model = DecisionTree.trainRegressor( - rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) + dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 5713fa7be0297..2518001ea0b93 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -218,8 +218,7 @@ def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index 75754286cd622..4962d05491c03 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -127,8 +127,7 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) if numFeatures <= 0: parsed.cache() - numFeatures = parsed.map( - lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 + numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) @staticmethod @@ -183,8 +182,7 @@ def loadLabeledPoints(sc, path, minPartitions=None): (0.0,[1.01,2.02,3.03]) """ minPartitions = minPartitions or min(sc.defaultParallelism, 2) - jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints( - sc._jsc, path, minPartitions) + jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints(sc._jsc, path, minPartitions) serialized = RDD(jSerialized, sc, NoOpSerializer()) return serialized.map(lambda bytes: 
_deserialize_labeled_point(bytearray(bytes))) @@ -196,8 +194,7 @@ def _test(): # The small batch size here ensures that we see multiple batches, # even in these small test examples: globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 374535587cfde..30b834d2085cd 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1078,9 +1078,10 @@ def saveAsNewAPIHadoopFile(self, path, outputFormatClass, keyClass=None, valueCl jconf = self.ctx._dictToJavaMap(conf) pickledRDD = self._toPickleSerialization() batched = isinstance(pickledRDD._jrdd_deserializer, BatchedSerializer) - self.ctx._jvm.PythonRDD.saveAsNewAPIHadoopFile( - pickledRDD._jrdd, batched, path, - outputFormatClass, keyClass, valueClass, keyConverter, valueConverter, jconf) + self.ctx._jvm.PythonRDD.saveAsNewAPIHadoopFile(pickledRDD._jrdd, batched, path, + outputFormatClass, + keyClass, valueClass, + keyConverter, valueConverter, jconf) def saveAsHadoopDataset(self, conf, keyConverter=None, valueConverter=None): """ @@ -1125,10 +1126,11 @@ def saveAsHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=No jconf = self.ctx._dictToJavaMap(conf) pickledRDD = self._toPickleSerialization() batched = isinstance(pickledRDD._jrdd_deserializer, BatchedSerializer) - self.ctx._jvm.PythonRDD.saveAsHadoopFile( - pickledRDD._jrdd, batched, path, - outputFormatClass, keyClass, valueClass, keyConverter, valueConverter, - jconf, compressionCodecClass) + self.ctx._jvm.PythonRDD.saveAsHadoopFile(pickledRDD._jrdd, batched, path, + outputFormatClass, + keyClass, valueClass, + keyConverter, valueConverter, + jconf, compressionCodecClass) def saveAsSequenceFile(self, path, compressionCodecClass=None): """ diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py index 35940191c70a3..55e247da0e4dc 100644 --- a/python/pyspark/rddsampler.py +++ b/python/pyspark/rddsampler.py @@ -31,8 +31,7 @@ def __init__(self, withReplacement, seed=None): "Falling back to default random generator for sampling.") self._use_numpy = False - self._seed = seed if seed is not None else random.randint( - 0, sys.maxint) + self._seed = seed if seed is not None else random.randint(0, sys.maxint) self._withReplacement = withReplacement self._random = None self._split = None @@ -87,8 +86,7 @@ def getPoissonSample(self, split, mean): def shuffle(self, vals): if self._random is None: - # this should only ever called on the master so - self.initRandomGenerator(0) + self.initRandomGenerator(0) # this should only ever called on the master so # the split does not matter if self._use_numpy: diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index be22020fb827b..e1e7cd954189f 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -39,8 +39,7 @@ if os.environ.get("ADD_FILES") is not None else None) if os.environ.get("SPARK_EXECUTOR_URI"): - SparkContext.setSystemProperty( - "spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) + SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) sc = SparkContext(appName="PySparkShell", pyFiles=add_files) diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 059cf1633a86f..293af6183e9cf 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py 
@@ -983,8 +983,7 @@ def registerFunction(self, name, f, returnType=StringType()): includes = ListConverter().convert(self._sc._python_includes, self._sc._gateway._gateway_client) self._ssql_ctx.registerPython(name, - bytearray( - CloudPickleSerializer().dumps(command)), + bytearray(CloudPickleSerializer().dumps(command)), env, includes, self._sc.pythonExec, @@ -1530,8 +1529,7 @@ def registerTempTable(self, name): self._jschema_rdd.registerTempTable(name) def registerAsTable(self, name): - warnings.warn( - "Use registerTempTable instead of registerAsTable.", DeprecationWarning) + warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning) self.registerTempTable(name) def insertInto(self, tableName, overwrite=False): diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py index 8b59d62bb40c3..1e597d64e03fe 100644 --- a/python/pyspark/statcounter.py +++ b/python/pyspark/statcounter.py @@ -51,15 +51,13 @@ def merge(self, value): return self - # Merge another StatCounter into this one, adding up the internal - # statistics. + # Merge another StatCounter into this one, adding up the internal statistics. def mergeStats(self, other): if not isinstance(other, StatCounter): raise Exception("Can only merge Statcounters!") if other is self: # reference equality holds - # Avoid overwriting fields in a weird order - self.merge(copy.deepcopy(other)) + self.merge(copy.deepcopy(other)) # Avoid overwriting fields in a weird order else: if self.n == 0: self.mu = other.mu @@ -75,14 +73,12 @@ def mergeStats(self, other): elif self.n * 10 < other.n: self.mu = other.mu - (delta * self.n) / (self.n + other.n) else: - self.mu = ( - self.mu * self.n + other.mu * other.n) / (self.n + other.n) + self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n) self.maxValue = maximum(self.maxValue, other.maxValue) self.minValue = minimum(self.minValue, other.minValue) - self.m2 += other.m2 + \ - (delta * delta * self.n * other.n) / (self.n + other.n) + self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n) self.n += other.n return self diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index ec2553ae46649..199f66392d1ac 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -227,12 +227,10 @@ def test_add_egg_file_locally(self): def func(): from userlib import UserClass self.assertRaises(ImportError, func) - path = os.path.join( - SPARK_HOME, "python/test_support/userlib-0.1-py2.7.egg") + path = os.path.join(SPARK_HOME, "python/test_support/userlib-0.1-py2.7.egg") self.sc.addPyFile(path) from userlib import UserClass - self.assertEqual( - "Hello World from inside a package!", UserClass().hello()) + self.assertEqual("Hello World from inside a package!", UserClass().hello()) class TestRDDFunctions(PySparkTestCase): @@ -240,8 +238,7 @@ class TestRDDFunctions(PySparkTestCase): def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.sc.stop() - self.assertRaises( - Exception, lambda: SparkContext("an-invalid-master-name")) + self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) self.sc = SparkContext("local") def test_save_as_textfile_with_unicode(self): @@ -338,8 +335,7 @@ def setUp(self): PySparkTestCase.setUp(self) self.tempdir = tempfile.NamedTemporaryFile(delete=False) os.unlink(self.tempdir.name) - self.sc._jvm.WriteInputFormatTestDataGenerator.generateData( - self.tempdir.name, self.sc._jsc) + 
self.sc._jvm.WriteInputFormatTestDataGenerator.generateData(self.tempdir.name, self.sc._jsc) def tearDown(self): PySparkTestCase.tearDown(self) @@ -350,15 +346,13 @@ def test_sequencefiles(self): ints = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfint/", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), - (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] self.assertEqual(ints, ei) doubles = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfdouble/", "org.apache.hadoop.io.DoubleWritable", "org.apache.hadoop.io.Text").collect()) - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), - (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] + ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] self.assertEqual(doubles, ed) bytes = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbytes/", @@ -386,8 +380,7 @@ def test_sequencefiles(self): bools = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbool/", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.BooleanWritable").collect()) - eb = [(1, False), (1, True), (2, False), - (2, False), (2, True), (3, True)] + eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] self.assertEqual(bools, eb) nulls = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfnull/", @@ -447,8 +440,7 @@ def test_oldhadoop(self): "org.apache.hadoop.mapred.SequenceFileInputFormat", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), - (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] self.assertEqual(ints, ei) hellopath = os.path.join(SPARK_HOME, "python/test_support/hello.txt") @@ -467,8 +459,7 @@ def test_newhadoop(self): "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), - (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] self.assertEqual(ints, ei) hellopath = os.path.join(SPARK_HOME, "python/test_support/hello.txt") @@ -541,21 +532,18 @@ def tearDown(self): def test_sequencefiles(self): basepath = self.tempdir.name - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), - (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] self.sc.parallelize(ei).saveAsSequenceFile(basepath + "/sfint/") ints = sorted(self.sc.sequenceFile(basepath + "/sfint/").collect()) self.assertEqual(ints, ei) - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), - (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] + ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] self.sc.parallelize(ed).saveAsSequenceFile(basepath + "/sfdouble/") doubles = sorted( self.sc.sequenceFile(basepath + "/sfdouble/").collect()) self.assertEqual(doubles, ed) - ebs = [(1, bytearray(b'\x00\x07spam\x08')), - (2, bytearray(b'\x00\x07spam\x08'))] + ebs = [(1, bytearray(b'\x00\x07spam\x08')), (2, bytearray(b'\x00\x07spam\x08'))] self.sc.parallelize(ebs).saveAsSequenceFile(basepath + "/sfbytes/") bytes = sorted(self.sc.sequenceFile(basepath + "/sfbytes/").collect()) self.assertEqual(bytes, ebs) @@ -567,8 +555,7 @@ def test_sequencefiles(self): text = sorted(self.sc.sequenceFile(basepath + "/sftext/").collect()) 
self.assertEqual(text, et) - eb = [(1, False), (1, True), (2, False), - (2, False), (2, True), (3, True)] + eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] self.sc.parallelize(eb).saveAsSequenceFile(basepath + "/sfbool/") bools = sorted(self.sc.sequenceFile(basepath + "/sfbool/").collect()) self.assertEqual(bools, eb) @@ -686,8 +673,7 @@ def test_converters(self): "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", keyConverter="org.apache.spark.api.python.TestOutputKeyConverter", valueConverter="org.apache.spark.api.python.TestOutputValueConverter") - converted = sorted( - self.sc.sequenceFile(basepath + "/converters/").collect()) + converted = sorted(self.sc.sequenceFile(basepath + "/converters/").collect()) expected = [(u'1', 3.0), (u'2', 1.0), (u'3', 2.0)] @@ -700,21 +686,19 @@ def test_reserialization(self): data = zip(x, y) rdd = self.sc.parallelize(x).zip(self.sc.parallelize(y)) rdd.saveAsSequenceFile(basepath + "/reserialize/sequence") - result1 = sorted( - self.sc.sequenceFile(basepath + "/reserialize/sequence").collect()) + result1 = sorted(self.sc.sequenceFile(basepath + "/reserialize/sequence").collect()) self.assertEqual(result1, data) - rdd.saveAsHadoopFile(basepath + "/reserialize/hadoop", - "org.apache.hadoop.mapred.SequenceFileOutputFormat") - result2 = sorted( - self.sc.sequenceFile(basepath + "/reserialize/hadoop").collect()) + rdd.saveAsHadoopFile( + basepath + "/reserialize/hadoop", + "org.apache.hadoop.mapred.SequenceFileOutputFormat") + result2 = sorted(self.sc.sequenceFile(basepath + "/reserialize/hadoop").collect()) self.assertEqual(result2, data) rdd.saveAsNewAPIHadoopFile( basepath + "/reserialize/newhadoop", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat") - result3 = sorted( - self.sc.sequenceFile(basepath + "/reserialize/newhadoop").collect()) + result3 = sorted(self.sc.sequenceFile(basepath + "/reserialize/newhadoop").collect()) self.assertEqual(result3, data) conf4 = { @@ -723,8 +707,7 @@ def test_reserialization(self): "mapred.output.value.class": "org.apache.hadoop.io.IntWritable", "mapred.output.dir": basepath + "/reserialize/dataset"} rdd.saveAsHadoopDataset(conf4) - result4 = sorted( - self.sc.sequenceFile(basepath + "/reserialize/dataset").collect()) + result4 = sorted(self.sc.sequenceFile(basepath + "/reserialize/dataset").collect()) self.assertEqual(result4, data) conf5 = {"mapreduce.outputformat.class": @@ -733,14 +716,12 @@ def test_reserialization(self): "mapred.output.value.class": "org.apache.hadoop.io.IntWritable", "mapred.output.dir": basepath + "/reserialize/newdataset"} rdd.saveAsNewAPIHadoopDataset(conf5) - result5 = sorted( - self.sc.sequenceFile(basepath + "/reserialize/newdataset").collect()) + result5 = sorted(self.sc.sequenceFile(basepath + "/reserialize/newdataset").collect()) self.assertEqual(result5, data) def test_unbatched_save_and_read(self): basepath = self.tempdir.name - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), - (2, u'bb'), (2, u'bb'), (3, u'cc')] + ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] self.sc.parallelize(ei, numSlices=len(ei)).saveAsSequenceFile( basepath + "/unbatched/") @@ -897,8 +878,7 @@ class TestSparkSubmit(unittest.TestCase): def setUp(self): self.programDir = tempfile.mkdtemp() - self.sparkSubmit = os.path.join( - os.environ.get("SPARK_HOME"), "bin", "spark-submit") + self.sparkSubmit = os.path.join(os.environ.get("SPARK_HOME"), "bin", "spark-submit") def tearDown(self): shutil.rmtree(self.programDir) @@ -935,8 +915,7 
@@ def test_single_script(self): |sc = SparkContext() |print sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect() """) - proc = subprocess.Popen( - [self.sparkSubmit, script], stdout=subprocess.PIPE) + proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE) out, err = proc.communicate() self.assertEqual(0, proc.returncode) self.assertIn("[2, 4, 6]", out) @@ -952,8 +931,7 @@ def test_script_with_local_functions(self): |sc = SparkContext() |print sc.parallelize([1, 2, 3]).map(foo).collect() """) - proc = subprocess.Popen( - [self.sparkSubmit, script], stdout=subprocess.PIPE) + proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE) out, err = proc.communicate() self.assertEqual(0, proc.returncode) self.assertIn("[3, 6, 9]", out) @@ -971,8 +949,9 @@ def test_module_dependency(self): |def myfunc(x): | return x + 1 """) - proc = subprocess.Popen([self.sparkSubmit, "--py-files", zip, script], - stdout=subprocess.PIPE) + proc = subprocess.Popen( + [self.sparkSubmit, "--py-files", zip, script], + stdout=subprocess.PIPE) out, err = proc.communicate() self.assertEqual(0, proc.returncode) self.assertIn("[2, 3, 4]", out) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 14a63256b551f..5c06aad09329f 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -56,8 +56,7 @@ def main(infile, outfile): SparkFiles._root_directory = spark_files_dir SparkFiles._is_running_on_worker = True - # fetch names of includes (*.zip and *.egg files) and construct - # PYTHONPATH + # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH # *.py files that were added will be copied here sys.path.append(spark_files_dir) num_python_includes = read_int(infile) From 7b4750eed89221d54c029618712d2e8862fe2110 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 4 Aug 2014 19:31:13 -0400 Subject: [PATCH 20/23] merge upstream changes --- python/pyspark/serializers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 5b6fa2c609537..9e54e3551d8fe 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -302,8 +302,11 @@ def __reduce__(self): def _hijack_namedtuple(): """ Hack namedtuple() to make it picklable """ - global _old_namedtuple # or it will put in closure + # hijack only one time + if hasattr(collections.namedtuple, "__hijack"): + return + global _old_namedtuple # or it will put in closure def _copy_func(f): return types.FunctionType(f.func_code, f.func_globals, f.func_name, f.func_defaults, f.func_closure) @@ -318,6 +321,7 @@ def namedtuple(name, fields, verbose=False, rename=False): collections.namedtuple.func_globals["_old_namedtuple"] = _old_namedtuple collections.namedtuple.func_globals["_hack_namedtuple"] = _hack_namedtuple collections.namedtuple.func_code = namedtuple.func_code + collections.namedtuple.__hijack = 1 # hack the cls already generated by namedtuple # those created in other module can be pickled as normal, From bf3094281077c9875a277929b4703525306d7bf0 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 4 Aug 2014 19:38:30 -0400 Subject: [PATCH 21/23] [SPARK-2627] PEP8: comment spacing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PEP 8 is the bee’s knees. 
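The one-line fix below is pep8's E261 rule: an inline comment must be separated from the code by at least two spaces. A minimal illustration, not taken from the Spark tree:

    x = x + 1 # E261: only one space between the statement and '#'
    x = x + 1  # fine: two spaces before '#', and '# ' starting the comment

The hunk also restores a blank line between the global statement and the nested def that follows it.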
--- python/pyspark/serializers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 9e54e3551d8fe..b35558db3e007 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -306,7 +306,8 @@ def _hijack_namedtuple(): if hasattr(collections.namedtuple, "__hijack"): return - global _old_namedtuple # or it will put in closure + global _old_namedtuple # or it will put in closure + def _copy_func(f): return types.FunctionType(f.func_code, f.func_globals, f.func_name, f.func_defaults, f.func_closure) From 0e0245f2356bba84a4c7df90fcbcd7dc51448ede Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 4 Aug 2014 20:08:17 -0400 Subject: [PATCH 22/23] [SPARK-2627] undo erroneous whitespace fixes --- python/pyspark/context.py | 4 ++-- python/pyspark/tests.py | 45 ++++++++++++++++++--------------------- python/pyspark/worker.py | 3 +-- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 840925d3e31e9..4001ecab5ea00 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -59,8 +59,8 @@ class SparkContext(object): _writeToFile = None _next_accum_id = 0 _active_spark_context = None - _lock = Lock() # zip and egg files that need to be added to PYTHONPATH - _python_includes = None + _lock = Lock() + _python_includes = None # zip and egg files that need to be added to PYTHONPATH _default_batch_size_for_serialized_input = 10 def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 199f66392d1ac..da580f3ecb3bb 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -539,8 +539,7 @@ def test_sequencefiles(self): ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] self.sc.parallelize(ed).saveAsSequenceFile(basepath + "/sfdouble/") - doubles = sorted( - self.sc.sequenceFile(basepath + "/sfdouble/").collect()) + doubles = sorted(self.sc.sequenceFile(basepath + "/sfdouble/").collect()) self.assertEqual(doubles, ed) ebs = [(1, bytearray(b'\x00\x07spam\x08')), (2, bytearray(b'\x00\x07spam\x08'))] @@ -725,25 +724,25 @@ def test_unbatched_save_and_read(self): self.sc.parallelize(ei, numSlices=len(ei)).saveAsSequenceFile( basepath + "/unbatched/") - unbatched_sequence = sorted(self.sc.sequenceFile(basepath + "/unbatched/", - batchSize=1).collect()) + unbatched_sequence = sorted(self.sc.sequenceFile( + basepath + "/unbatched/", + batchSize=1).collect()) self.assertEqual(unbatched_sequence, ei) - unbatched_hadoopFile = sorted( - self.sc.hadoopFile(basepath + "/unbatched/", - "org.apache.hadoop.mapred.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text", - batchSize=1).collect()) + unbatched_hadoopFile = sorted(self.sc.hadoopFile( + basepath + "/unbatched/", + "org.apache.hadoop.mapred.SequenceFileInputFormat", + "org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.Text", + batchSize=1).collect()) self.assertEqual(unbatched_hadoopFile, ei) - unbatched_newAPIHadoopFile = sorted( - self.sc.newAPIHadoopFile( - basepath + "/unbatched/", - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text", - batchSize=1).collect()) + unbatched_newAPIHadoopFile = sorted(self.sc.newAPIHadoopFile( + basepath + "/unbatched/", + 
"org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", + "org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.Text", + batchSize=1).collect()) self.assertEqual(unbatched_newAPIHadoopFile, ei) oldconf = {"mapred.input.dir": basepath + "/unbatched/"} @@ -949,9 +948,8 @@ def test_module_dependency(self): |def myfunc(x): | return x + 1 """) - proc = subprocess.Popen( - [self.sparkSubmit, "--py-files", zip, script], - stdout=subprocess.PIPE) + proc = subprocess.Popen([self.sparkSubmit, "--py-files", zip, script], + stdout=subprocess.PIPE) out, err = proc.communicate() self.assertEqual(0, proc.returncode) self.assertIn("[2, 3, 4]", out) @@ -969,10 +967,9 @@ def test_module_dependency_on_cluster(self): |def myfunc(x): | return x + 1 """) - proc = subprocess.Popen( - [self.sparkSubmit, "--py-files", zip, "--master", - "local-cluster[1,1,512]", script], - stdout=subprocess.PIPE) + proc = subprocess.Popen([self.sparkSubmit, "--py-files", zip, "--master", + "local-cluster[1,1,512]", script], + stdout=subprocess.PIPE) out, err = proc.communicate() self.assertEqual(0, proc.returncode) self.assertIn("[2, 3, 4]", out) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 5c06aad09329f..2770f63059853 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -57,8 +57,7 @@ def main(infile, outfile): SparkFiles._is_running_on_worker = True # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH - # *.py files that were added will be copied here - sys.path.append(spark_files_dir) + sys.path.append(spark_files_dir) # *.py files that were added will be copied here num_python_includes = read_int(infile) for _ in range(num_python_includes): filename = utf8_deserializer.loads(infile) From 274b2384bb28f03eb034686e6e730e73af3ffaf3 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 6 Aug 2014 14:29:55 -0400 Subject: [PATCH 23/23] [SPARK-2627] [PySpark] minor indentation changes --- python/pyspark/tests.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index da580f3ecb3bb..88a61176e51ab 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -594,7 +594,8 @@ def test_oldhadoop(self): "mapred.output.format.class": "org.apache.hadoop.mapred.SequenceFileOutputFormat", "mapred.output.key.class": "org.apache.hadoop.io.IntWritable", "mapred.output.value.class": "org.apache.hadoop.io.MapWritable", - "mapred.output.dir": basepath + "/olddataset/"} + "mapred.output.dir": basepath + "/olddataset/" + } self.sc.parallelize(dict_data).saveAsHadoopDataset(conf) input_conf = {"mapred.input.dir": basepath + "/olddataset/"} old_dataset = sorted(self.sc.hadoopRDD( @@ -624,11 +625,13 @@ def test_newhadoop(self): valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect()) self.assertEqual(result, array_data) - conf = {"mapreduce.outputformat.class": + conf = { + "mapreduce.outputformat.class": "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "mapred.output.key.class": "org.apache.hadoop.io.IntWritable", - "mapred.output.value.class": "org.apache.spark.api.python.DoubleArrayWritable", - "mapred.output.dir": basepath + "/newdataset/"} + "mapred.output.key.class": "org.apache.hadoop.io.IntWritable", + "mapred.output.value.class": "org.apache.spark.api.python.DoubleArrayWritable", + "mapred.output.dir": basepath + "/newdataset/" + } self.sc.parallelize(array_data).saveAsNewAPIHadoopDataset( conf, 
valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter") @@ -1012,8 +1015,7 @@ class NumPyTests(PySparkTestCase): """General PySpark tests that depend on numpy """ def test_statcounter_array(self): - x = self.sc.parallelize( - [np.array([1.0, 1.0]), np.array([2.0, 2.0]), np.array([3.0, 3.0])]) + x = self.sc.parallelize([np.array([1.0, 1.0]), np.array([2.0, 2.0]), np.array([3.0, 3.0])]) s = x.stats() self.assertSequenceEqual([2.0, 2.0], s.mean().tolist()) self.assertSequenceEqual([1.0, 1.0], s.min().tolist())
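The through-line of this last patch: when a dict literal is too wide for one line, these tests now open the brace at the end of the assignment, give every key one level of hanging indent, and put the closing brace back at the statement's indent. A standalone sketch of that shape, with the keys taken from the hunks above (the output path is a hypothetical stand-in for the tempdir the tests actually use):

    conf = {
        "mapreduce.outputformat.class":
            "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
        "mapred.output.key.class": "org.apache.hadoop.io.IntWritable",
        "mapred.output.value.class": "org.apache.spark.api.python.DoubleArrayWritable",
        "mapred.output.dir": "/tmp/newdataset/"  # hypothetical; the tests use a tempdir
    }

One nice property of this layout is diff stability: adding or removing a key touches only its own line, not the brace placement of its neighbors.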