diff --git a/.rat-excludes b/.rat-excludes index 9165872b9fb27..08fba6d351d6a 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -15,20 +15,8 @@ TAGS RELEASE control docs -docker.properties.template -fairscheduler.xml.template -spark-defaults.conf.template -log4j.properties -log4j.properties.template -metrics.properties -metrics.properties.template slaves -slaves.template -spark-env.sh spark-env.cmd -spark-env.sh.template -log4j-defaults.properties -log4j-defaults-repl.properties bootstrap-tooltip.js jquery-1.11.1.min.js d3.min.js diff --git a/LICENSE b/LICENSE index f9e412cade345..dca03ab16de29 100644 --- a/LICENSE +++ b/LICENSE @@ -211,712 +211,45 @@ subcomponents is subject to the terms and conditions of the following licenses. -======================================================================= -For the Boto EC2 library (ec2/third_party/boto*.zip): -======================================================================= - -Copyright (c) 2006-2008 Mitch Garnaat http://garnaat.org/ - -Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, dis- -tribute, sublicense, and/or sell copies of the Software, and to permit -persons to whom the Software is furnished to do so, subject to the fol- -lowing conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- -ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT -SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. - - -======================================================================== -For CloudPickle (pyspark/cloudpickle.py): -======================================================================== - -Copyright (c) 2012, Regents of the University of California. -Copyright (c) 2009 `PiCloud, Inc. `_. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the University of California, Berkeley nor the - names of its contributors may be used to endorse or promote - products derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -======================================================================== -For Py4J (python/lib/py4j-0.8.2.1-src.zip) -======================================================================== - -Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -- Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -- The name of the author may not be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -======================================================================== -For DPark join code (python/pyspark/join.py): -======================================================================== - -Copyright (c) 2011, Douban Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - - * Neither the name of the Douban Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - ======================================================================== For heapq (pyspark/heapq3.py): ======================================================================== -# A. HISTORY OF THE SOFTWARE -# ========================== -# -# Python was created in the early 1990s by Guido van Rossum at Stichting -# Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands -# as a successor of a language called ABC. Guido remains Python's -# principal author, although it includes many contributions from others. -# -# In 1995, Guido continued his work on Python at the Corporation for -# National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) -# in Reston, Virginia where he released several versions of the -# software. -# -# In May 2000, Guido and the Python core development team moved to -# BeOpen.com to form the BeOpen PythonLabs team. In October of the same -# year, the PythonLabs team moved to Digital Creations (now Zope -# Corporation, see http://www.zope.com). In 2001, the Python Software -# Foundation (PSF, see http://www.python.org/psf/) was formed, a -# non-profit organization created specifically to own Python-related -# Intellectual Property. Zope Corporation is a sponsoring member of -# the PSF. -# -# All Python releases are Open Source (see http://www.opensource.org for -# the Open Source Definition). Historically, most, but not all, Python -# releases have also been GPL-compatible; the table below summarizes -# the various releases. -# -# Release Derived Year Owner GPL- -# from compatible? (1) -# -# 0.9.0 thru 1.2 1991-1995 CWI yes -# 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes -# 1.6 1.5.2 2000 CNRI no -# 2.0 1.6 2000 BeOpen.com no -# 1.6.1 1.6 2001 CNRI yes (2) -# 2.1 2.0+1.6.1 2001 PSF no -# 2.0.1 2.0+1.6.1 2001 PSF yes -# 2.1.1 2.1+2.0.1 2001 PSF yes -# 2.2 2.1.1 2001 PSF yes -# 2.1.2 2.1.1 2002 PSF yes -# 2.1.3 2.1.2 2002 PSF yes -# 2.2.1 2.2 2002 PSF yes -# 2.2.2 2.2.1 2002 PSF yes -# 2.2.3 2.2.2 2003 PSF yes -# 2.3 2.2.2 2002-2003 PSF yes -# 2.3.1 2.3 2002-2003 PSF yes -# 2.3.2 2.3.1 2002-2003 PSF yes -# 2.3.3 2.3.2 2002-2003 PSF yes -# 2.3.4 2.3.3 2004 PSF yes -# 2.3.5 2.3.4 2005 PSF yes -# 2.4 2.3 2004 PSF yes -# 2.4.1 2.4 2005 PSF yes -# 2.4.2 2.4.1 2005 PSF yes -# 2.4.3 2.4.2 2006 PSF yes -# 2.4.4 2.4.3 2006 PSF yes -# 2.5 2.4 2006 PSF yes -# 2.5.1 2.5 2007 PSF yes -# 2.5.2 2.5.1 2008 PSF yes -# 2.5.3 2.5.2 2008 PSF yes -# 2.6 2.5 2008 PSF yes -# 2.6.1 2.6 2008 PSF yes -# 2.6.2 2.6.1 2009 PSF yes -# 2.6.3 2.6.2 2009 PSF yes -# 2.6.4 2.6.3 2009 PSF yes -# 2.6.5 2.6.4 2010 PSF yes -# 2.7 2.6 2010 PSF yes -# -# Footnotes: -# -# (1) GPL-compatible doesn't mean that we're distributing Python under -# the GPL. All Python licenses, unlike the GPL, let you distribute -# a modified version without making your changes open source. The -# GPL-compatible licenses make it possible to combine Python with -# other software that is released under the GPL; the others don't. 
-# -# (2) According to Richard Stallman, 1.6.1 is not GPL-compatible, -# because its license has a choice of law clause. According to -# CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 -# is "not incompatible" with the GPL. -# -# Thanks to the many outside volunteers who have worked under Guido's -# direction to make these releases possible. -# -# -# B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON -# =============================================================== -# -# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 -# -------------------------------------------- -# -# 1. This LICENSE AGREEMENT is between the Python Software Foundation -# ("PSF"), and the Individual or Organization ("Licensee") accessing and -# otherwise using this software ("Python") in source or binary form and -# its associated documentation. -# -# 2. Subject to the terms and conditions of this License Agreement, PSF hereby -# grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, -# analyze, test, perform and/or display publicly, prepare derivative works, -# distribute, and otherwise use Python alone or in any derivative version, -# provided, however, that PSF's License Agreement and PSF's notice of copyright, -# i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -# 2011, 2012, 2013 Python Software Foundation; All Rights Reserved" are retained -# in Python alone or in any derivative version prepared by Licensee. -# -# 3. In the event Licensee prepares a derivative work that is based on -# or incorporates Python or any part thereof, and wants to make -# the derivative work available to others as provided herein, then -# Licensee hereby agrees to include in any such work a brief summary of -# the changes made to Python. -# -# 4. PSF is making Python available to Licensee on an "AS IS" -# basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -# FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, -# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 6. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 7. Nothing in this License Agreement shall be deemed to create any -# relationship of agency, partnership, or joint venture between PSF and -# Licensee. This License Agreement does not grant permission to use PSF -# trademarks or trade name in a trademark sense to endorse or promote -# products or services of Licensee, or any third party. -# -# 8. By copying, installing or otherwise using Python, Licensee -# agrees to be bound by the terms and conditions of this License -# Agreement. -# -# -# BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 -# ------------------------------------------- -# -# BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 -# -# 1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an -# office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the -# Individual or Organization ("Licensee") accessing and otherwise using -# this software in source or binary form and its associated -# documentation ("the Software"). -# -# 2. 
Subject to the terms and conditions of this BeOpen Python License -# Agreement, BeOpen hereby grants Licensee a non-exclusive, -# royalty-free, world-wide license to reproduce, analyze, test, perform -# and/or display publicly, prepare derivative works, distribute, and -# otherwise use the Software alone or in any derivative version, -# provided, however, that the BeOpen Python License is retained in the -# Software, alone or in any derivative version prepared by Licensee. -# -# 3. BeOpen is making the Software available to Licensee on an "AS IS" -# basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE -# SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS -# AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY -# DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 5. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 6. This License Agreement shall be governed by and interpreted in all -# respects by the law of the State of California, excluding conflict of -# law provisions. Nothing in this License Agreement shall be deemed to -# create any relationship of agency, partnership, or joint venture -# between BeOpen and Licensee. This License Agreement does not grant -# permission to use BeOpen trademarks or trade names in a trademark -# sense to endorse or promote products or services of Licensee, or any -# third party. As an exception, the "BeOpen Python" logos available at -# http://www.pythonlabs.com/logos.html may be used according to the -# permissions granted on that web page. -# -# 7. By copying, installing or otherwise using the software, Licensee -# agrees to be bound by the terms and conditions of this License -# Agreement. -# -# -# CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 -# --------------------------------------- -# -# 1. This LICENSE AGREEMENT is between the Corporation for National -# Research Initiatives, having an office at 1895 Preston White Drive, -# Reston, VA 20191 ("CNRI"), and the Individual or Organization -# ("Licensee") accessing and otherwise using Python 1.6.1 software in -# source or binary form and its associated documentation. -# -# 2. Subject to the terms and conditions of this License Agreement, CNRI -# hereby grants Licensee a nonexclusive, royalty-free, world-wide -# license to reproduce, analyze, test, perform and/or display publicly, -# prepare derivative works, distribute, and otherwise use Python 1.6.1 -# alone or in any derivative version, provided, however, that CNRI's -# License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) -# 1995-2001 Corporation for National Research Initiatives; All Rights -# Reserved" are retained in Python 1.6.1 alone or in any derivative -# version prepared by Licensee. Alternately, in lieu of CNRI's License -# Agreement, Licensee may substitute the following text (omitting the -# quotes): "Python 1.6.1 is made available subject to the terms and -# conditions in CNRI's License Agreement. This Agreement together with -# Python 1.6.1 may be located on the Internet using the following -# unique, persistent identifier (known as a handle): 1895.22/1013. 
This -# Agreement may also be obtained from a proxy server on the Internet -# using the following URL: http://hdl.handle.net/1895.22/1013". -# -# 3. In the event Licensee prepares a derivative work that is based on -# or incorporates Python 1.6.1 or any part thereof, and wants to make -# the derivative work available to others as provided herein, then -# Licensee hereby agrees to include in any such work a brief summary of -# the changes made to Python 1.6.1. -# -# 4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" -# basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -# 1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, -# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 6. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 7. This License Agreement shall be governed by the federal -# intellectual property law of the United States, including without -# limitation the federal copyright law, and, to the extent such -# U.S. federal law does not apply, by the law of the Commonwealth of -# Virginia, excluding Virginia's conflict of law provisions. -# Notwithstanding the foregoing, with regard to derivative works based -# on Python 1.6.1 that incorporate non-separable material that was -# previously distributed under the GNU General Public License (GPL), the -# law of the Commonwealth of Virginia shall govern this License -# Agreement only as to issues arising under or with respect to -# Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this -# License Agreement shall be deemed to create any relationship of -# agency, partnership, or joint venture between CNRI and Licensee. This -# License Agreement does not grant permission to use CNRI trademarks or -# trade name in a trademark sense to endorse or promote products or -# services of Licensee, or any third party. -# -# 8. By clicking on the "ACCEPT" button where indicated, or by copying, -# installing or otherwise using Python 1.6.1, Licensee agrees to be -# bound by the terms and conditions of this License Agreement. -# -# ACCEPT -# -# -# CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 -# -------------------------------------------------- -# -# Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, -# The Netherlands. All rights reserved. -# -# Permission to use, copy, modify, and distribute this software and its -# documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appear in all copies and that -# both that copyright notice and this permission notice appear in -# supporting documentation, and that the name of Stichting Mathematisch -# Centrum or CWI not be used in advertising or publicity pertaining to -# distribution of the software without specific, written prior -# permission. 
-# -# STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO -# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -# FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE -# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -======================================================================== -For sorttable (core/src/main/resources/org/apache/spark/ui/static/sorttable.js): -======================================================================== - -Copyright (c) 1997-2007 Stuart Langridge - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. - -======================================================================== -For d3 (core/src/main/resources/org/apache/spark/ui/static/d3.min.js): -======================================================================== - -Copyright (c) 2010-2015, Michael Bostock -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* The name Michael Bostock may not be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, -INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -======================================================================== -For Scala Interpreter classes (all .scala files in repl/src/main/scala -except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), -and for SerializableMapWrapper in JavaUtils.scala: -======================================================================== - -Copyright (c) 2002-2013 EPFL -Copyright (c) 2011-2013 Typesafe, Inc. - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -- Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -- Neither the name of the EPFL nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -======================================================================== -For sbt and sbt-launch-lib.bash in sbt/: -======================================================================== - -// Generated from http://www.opensource.org/licenses/bsd-license.php -Copyright (c) 2011, Paul Phillips. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of the author nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +See license/LICENSE-heapq.txt ======================================================================== For SnapTree: ======================================================================== -SNAPTREE LICENSE - -Copyright (c) 2009-2012 Stanford University, unless otherwise specified. -All rights reserved. - -This software was developed by the Pervasive Parallelism Laboratory of -Stanford University, California, USA. - -Permission to use, copy, modify, and distribute this software in source -or binary form for any purpose with or without fee is hereby granted, -provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. Neither the name of Stanford University nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -SUCH DAMAGE. - - -======================================================================== -For Timsort (core/src/main/java/org/apache/spark/util/collection/TimSort.java): -======================================================================== -Copyright (C) 2008 The Android Open Source Project - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -======================================================================== -For TestTimSort (core/src/test/java/org/apache/spark/util/collection/TestTimSort.java): -======================================================================== -Copyright (C) 2015 Stijn de Gouw - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -======================================================================== -For LimitedInputStream - (network/common/src/main/java/org/apache/spark/network/util/LimitedInputStream.java): -======================================================================== -Copyright (C) 2007 The Guava Authors - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -======================================================================== -For vis.js (core/src/main/resources/org/apache/spark/ui/static/vis.min.js): -======================================================================== -Copyright (C) 2010-2015 Almende B.V. - -Vis.js is dual licensed under both - - * The Apache 2.0 License - http://www.apache.org/licenses/LICENSE-2.0 - -and - - * The MIT License - http://opensource.org/licenses/MIT - -Vis.js may be distributed under either license. +See license/LICENSE-SnapTree.txt ======================================================================== -For dagre-d3 (core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js): +For jbcrypt: ======================================================================== -Copyright (c) 2013 Chris Pettitt - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-======================================================================== -For graphlib-dot (core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js): -======================================================================== -Copyright (c) 2012-2013 Chris Pettitt - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. +See license/LICENSE-jbcrypt.txt ======================================================================== BSD-style licenses ======================================================================== The following components are provided under a BSD-style license. See project link for details. +The text of each license is also included at licenses/LICENSE-[project].txt. - (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core) + (BSD 3 Clause) netlib core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core) (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model) (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.4 - http://jblas.org/) (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/) - (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org) (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org) (BSD licence) ANTLR StringTemplate (org.antlr:stringtemplate:3.2.1 - http://www.stringtemplate.org) - (BSD style) Hamcrest Core (org.hamcrest:hamcrest-core:1.1 - no url defined) + (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org) (BSD) JLine (jline:jline:0.9.94 - http://jline.sourceforge.net) (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.3 - http://paranamer.codehaus.org/paranamer) (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.6 - http://paranamer.codehaus.org/paranamer) - (BSD-like) (The BSD License) jline (org.scala-lang:jline:2.10.4 - http://www.scala-lang.org/) + (BSD 3 Clause) Scala (http://www.scala-lang.org/download/#License) + (Interpreter classes (all .scala files in repl/src/main/scala + except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), + and for SerializableMapWrapper in JavaUtils.scala) (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.10.4 - http://www.scala-lang.org/) (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.10.4 - http://www.scala-lang.org/) (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.10.4 - http://www.scala-lang.org/) @@ -934,13 +267,17 @@ The following components 
are provided under a BSD-style license. See project lin (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) (The New BSD License) Py4J (net.sf.py4j:py4j:0.8.2.1 - http://py4j.sourceforge.net/) (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) - (ISC/BSD License) jbcrypt (org.mindrot:jbcrypt:0.3m - http://www.mindrot.org/) + (BSD licence) sbt and sbt-launch-lib.bash + (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) + (BSD 3 Clause) DPark (https://github.com/douban/dpark/blob/master/LICENSE) + (BSD 3 Clause) CloudPickle (https://github.com/cloudpipe/cloudpickle/blob/master/LICENSE) ======================================================================== MIT licenses ======================================================================== The following components are provided under the MIT License. See project link for details. +The text of each license is also included at licenses/LICENSE-[project].txt. (MIT License) JCL 1.1.1 implemented over SLF4J (org.slf4j:jcl-over-slf4j:1.7.5 - http://www.slf4j.org) (MIT License) JUL to SLF4J bridge (org.slf4j:jul-to-slf4j:1.7.5 - http://www.slf4j.org) @@ -951,3 +288,7 @@ The following components are provided under the MIT License. See project link fo (The MIT License) Mockito (org.mockito:mockito-core:1.9.5 - http://www.mockito.org) (MIT License) jquery (https://jquery.org/license/) (MIT License) AnchorJS (https://github.com/bryanbraun/anchorjs) + (MIT License) graphlib-dot (https://github.com/cpettitt/graphlib-dot) + (MIT License) dagre-d3 (https://github.com/cpettitt/dagre-d3) + (MIT License) sorttable (https://github.com/stuartlangridge/sorttable) + (MIT License) boto (https://github.com/boto/boto/blob/develop/LICENSE) diff --git a/NOTICE b/NOTICE index 452aef2871652..7f7769f73047f 100644 --- a/NOTICE +++ b/NOTICE @@ -572,3 +572,38 @@ Copyright 2009-2013 The Apache Software Foundation Apache Avro IPC Copyright 2009-2013 The Apache Software Foundation + + +Vis.js +Copyright 2010-2015 Almende B.V. + +Vis.js is dual licensed under both + + * The Apache 2.0 License + http://www.apache.org/licenses/LICENSE-2.0 + + and + + * The MIT License + http://opensource.org/licenses/MIT + +Vis.js may be distributed under either license. 
+ + +Vis.js uses and redistributes the following third-party libraries: + +- component-emitter + https://github.com/component/emitter + The MIT License + +- hammer.js + http://hammerjs.github.io/ + The MIT License + +- moment.js + http://momentjs.com/ + The MIT License + +- keycharm + https://github.com/AlexDM0/keycharm + The MIT License \ No newline at end of file diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index d0d7201f004a2..a3a16c42a6214 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,7 +1,7 @@ Package: SparkR Type: Package Title: R frontend for Spark -Version: 1.5.0 +Version: 1.6.0 Date: 2013-09-09 Author: The Apache Software Foundation Maintainer: Shivaram Venkataraman diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 5286c01986204..c28c47daeac13 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -69,9 +69,11 @@ exportMethods("arrange", "selectExpr", "show", "showDF", + "subset", "summarize", "summary", "take", + "transform", "unionAll", "unique", "unpersist", @@ -245,3 +247,5 @@ export("structField", "structType.jobj", "structType.structField", "print.structType") + +export("as.data.frame") \ No newline at end of file diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 74de7c81e35a6..65e368c47dd81 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -271,7 +271,7 @@ setMethod("names<-", signature(x = "DataFrame"), function(x, value) { if (!is.null(value)) { - sdf <- callJMethod(x@sdf, "toDF", listToSeq(as.list(value))) + sdf <- callJMethod(x@sdf, "toDF", as.list(value)) dataFrame(sdf) } }) @@ -843,10 +843,10 @@ setMethod("groupBy", function(x, ...) { cols <- list(...) if (length(cols) >= 1 && class(cols[[1]]) == "character") { - sgd <- callJMethod(x@sdf, "groupBy", cols[[1]], listToSeq(cols[-1])) + sgd <- callJMethod(x@sdf, "groupBy", cols[[1]], cols[-1]) } else { jcol <- lapply(cols, function(c) { c@jc }) - sgd <- callJMethod(x@sdf, "groupBy", listToSeq(jcol)) + sgd <- callJMethod(x@sdf, "groupBy", jcol) } groupedData(sgd) }) @@ -987,7 +987,7 @@ setMethod("$<-", signature(x = "DataFrame"), setClassUnion("numericOrcharacter", c("numeric", "character")) -#' @rdname select +#' @rdname subset #' @name [[ setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), function(x, i) { @@ -998,7 +998,7 @@ setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), getColumn(x, i) }) -#' @rdname select +#' @rdname subset #' @name [ setMethod("[", signature(x = "DataFrame", i = "missing"), function(x, i, j, ...) { @@ -1012,7 +1012,7 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), select(x, j) }) -#' @rdname select +#' @rdname subset #' @name [ setMethod("[", signature(x = "DataFrame", i = "Column"), function(x, i, j, ...) { @@ -1020,12 +1020,43 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html filtered <- filter(x, i) if (!missing(j)) { - filtered[, j] + filtered[, j, ...] 
} else { filtered } }) +#' Subset +#' +#' Return subsets of DataFrame according to given conditions +#' @param x A DataFrame +#' @param subset A logical expression to filter on rows +#' @param select expression for the single Column or a list of columns to select from the DataFrame +#' @return A new DataFrame containing only the rows that meet the condition with selected columns +#' @export +#' @rdname subset +#' @name subset +#' @aliases [ +#' @family subsetting functions +#' @examples +#' \dontrun{ +#' # Columns can be selected using `[[` and `[` +#' df[[2]] == df[["age"]] +#' df[,2] == df[,"age"] +#' df[,c("name", "age")] +#' # Or to filter rows +#' df[df$age > 20,] +#' # DataFrame can be subset on both rows and Columns +#' df[df$name == "Smith", c(1,2)] +#' df[df$age %in% c(19, 30), 1:2] +#' subset(df, df$age %in% c(19, 30), 1:2) +#' subset(df, df$age %in% c(19), select = c(1,2)) +#' } +setMethod("subset", signature(x = "DataFrame"), + function(x, subset, select, ...) { + x[subset, select, ...] + }) + #' Select #' #' Selects a set of columns with names or Column expressions. @@ -1034,6 +1065,8 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), #' @return A new DataFrame with selected columns #' @export #' @rdname select +#' @name select +#' @family subsetting functions #' @examples #' \dontrun{ #' select(df, "*") @@ -1041,19 +1074,12 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), #' select(df, df$name, df$age + 1) #' select(df, c("col1", "col2")) #' select(df, list(df$name, df$age + 1)) -#' # Columns can also be selected using `[[` and `[` -#' df[[2]] == df[["age"]] -#' df[,2] == df[,"age"] -#' df[,c("name", "age")] #' # Similar to R data frames columns can also be selected using `$` #' df$age -#' # It can also be subset on rows and Columns -#' df[df$name == "Smith", c(1,2)] -#' df[df$age %in% c(19, 30), 1:2] #' } setMethod("select", signature(x = "DataFrame", col = "character"), function(x, col, ...) { - sdf <- callJMethod(x@sdf, "select", col, toSeq(...)) + sdf <- callJMethod(x@sdf, "select", col, list(...)) dataFrame(sdf) }) @@ -1064,7 +1090,7 @@ setMethod("select", signature(x = "DataFrame", col = "Column"), jcols <- lapply(list(col, ...), function(c) { c@jc }) - sdf <- callJMethod(x@sdf, "select", listToSeq(jcols)) + sdf <- callJMethod(x@sdf, "select", jcols) dataFrame(sdf) }) @@ -1080,7 +1106,7 @@ setMethod("select", col(c)@jc } }) - sdf <- callJMethod(x@sdf, "select", listToSeq(cols)) + sdf <- callJMethod(x@sdf, "select", cols) dataFrame(sdf) }) @@ -1107,7 +1133,7 @@ setMethod("selectExpr", signature(x = "DataFrame", expr = "character"), function(x, expr, ...) { exprList <- list(expr, ...) - sdf <- callJMethod(x@sdf, "selectExpr", listToSeq(exprList)) + sdf <- callJMethod(x@sdf, "selectExpr", exprList) dataFrame(sdf) }) @@ -1121,7 +1147,7 @@ setMethod("selectExpr", #' @return A DataFrame with the new column added. #' @rdname withColumn #' @name withColumn -#' @aliases mutate +#' @aliases mutate transform #' @export #' @examples #'\dontrun{ @@ -1141,11 +1167,12 @@ setMethod("withColumn", #' #' Return a new DataFrame with the specified columns added. #' -#' @param x A DataFrame +#' @param .data A DataFrame #' @param col a named argument of the form name = col #' @return A new DataFrame with the new columns added. 
#' @rdname withColumn #' @name mutate +#' @aliases withColumn transform #' @export #' @examples #'\dontrun{ @@ -1155,10 +1182,12 @@ setMethod("withColumn", #' df <- jsonFile(sqlContext, path) #' newDF <- mutate(df, newCol = df$col1 * 5, newCol2 = df$col1 * 2) #' names(newDF) # Will contain newCol, newCol2 +#' newDF2 <- transform(df, newCol = df$col1 / 5, newCol2 = df$col1 * 2) #' } setMethod("mutate", - signature(x = "DataFrame"), - function(x, ...) { + signature(.data = "DataFrame"), + function(.data, ...) { + x <- .data cols <- list(...) stopifnot(length(cols) > 0) stopifnot(class(cols[[1]]) == "Column") @@ -1173,6 +1202,16 @@ setMethod("mutate", do.call(select, c(x, x$"*", cols)) }) +#' @export +#' @rdname withColumn +#' @name transform +#' @aliases withColumn mutate +setMethod("transform", + signature(`_data` = "DataFrame"), + function(`_data`, ...) { + mutate(`_data`, ...) + }) + #' WithColumnRenamed #' #' Rename an existing column in a DataFrame. @@ -1272,12 +1311,12 @@ setMethod("arrange", signature(x = "DataFrame", col = "characterOrColumn"), function(x, col, ...) { if (class(col) == "character") { - sdf <- callJMethod(x@sdf, "sort", col, toSeq(...)) + sdf <- callJMethod(x@sdf, "sort", col, list(...)) } else if (class(col) == "Column") { jcols <- lapply(list(col, ...), function(c) { c@jc }) - sdf <- callJMethod(x@sdf, "sort", listToSeq(jcols)) + sdf <- callJMethod(x@sdf, "sort", jcols) } dataFrame(sdf) }) @@ -1300,6 +1339,7 @@ setMethod("orderBy", #' @return A DataFrame containing only the rows that meet the condition. #' @rdname filter #' @name filter +#' @family subsetting functions #' @export #' @examples #'\dontrun{ @@ -1624,7 +1664,7 @@ setMethod("describe", signature(x = "DataFrame", col = "character"), function(x, col, ...) { colList <- list(col, ...) - sdf <- callJMethod(x@sdf, "describe", listToSeq(colList)) + sdf <- callJMethod(x@sdf, "describe", colList) dataFrame(sdf) }) @@ -1634,7 +1674,7 @@ setMethod("describe", signature(x = "DataFrame"), function(x) { colList <- as.list(c(columns(x))) - sdf <- callJMethod(x@sdf, "describe", listToSeq(colList)) + sdf <- callJMethod(x@sdf, "describe", colList) dataFrame(sdf) }) @@ -1691,7 +1731,7 @@ setMethod("dropna", naFunctions <- callJMethod(x@sdf, "na") sdf <- callJMethod(naFunctions, "drop", - as.integer(minNonNulls), listToSeq(as.list(cols))) + as.integer(minNonNulls), as.list(cols)) dataFrame(sdf) }) @@ -1775,7 +1815,7 @@ setMethod("fillna", sdf <- if (length(cols) == 0) { callJMethod(naFunctions, "fill", value) } else { - callJMethod(naFunctions, "fill", value, listToSeq(as.list(cols))) + callJMethod(naFunctions, "fill", value, as.list(cols)) } dataFrame(sdf) }) @@ -1808,3 +1848,28 @@ setMethod("crosstab", sct <- callJMethod(statFunctions, "crosstab", col1, col2) collect(dataFrame(sct)) }) + + +#' This function downloads the contents of a DataFrame into an R's data.frame. +#' Since data.frames are held in memory, ensure that you have enough memory +#' in your system to accommodate the contents. +#' +#' @title Download data from a DataFrame into a data.frame +#' @param x a DataFrame +#' @return a data.frame +#' @rdname as.data.frame +#' @examples \dontrun{ +#' +#' irisDF <- createDataFrame(sqlContext, iris) +#' df <- as.data.frame(irisDF[irisDF$Species == "setosa", ]) +#' } +setMethod("as.data.frame", + signature(x = "DataFrame"), + function(x, ...) 
{ + # Check if additional parameters have been passed + if (length(list(...)) > 0) { + stop(paste("Unused argument(s): ", paste(list(...), collapse=", "))) + } + collect(x) + } +) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 1bc6445311473..1c58fd96d750a 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -41,15 +41,12 @@ infer_type <- function(x) { if (type == "map") { stopifnot(length(x) > 0) key <- ls(x)[[1]] - list(type = "map", - keyType = "string", - valueType = infer_type(get(key, x)), - valueContainsNull = TRUE) + paste0("map") } else if (type == "array") { stopifnot(length(x) > 0) names <- names(x) if (is.null(names)) { - list(type = "array", elementType = infer_type(x[[1]]), containsNull = TRUE) + paste0("array<", infer_type(x[[1]]), ">") } else { # StructType types <- lapply(x, infer_type) @@ -59,7 +56,7 @@ infer_type <- function(x) { do.call(structType, fields) } } else if (length(x) > 1) { - list(type = "array", elementType = type, containsNull = TRUE) + paste0("array<", infer_type(x[[1]]), ">") } else { type } diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 4805096f3f9c5..42e9d12179db7 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -211,8 +211,7 @@ setMethod("cast", setMethod("%in%", signature(x = "Column"), function(x, table) { - table <- listToSeq(as.list(table)) - jc <- callJMethod(x@jc, "in", table) + jc <- callJMethod(x@jc, "in", as.list(table)) return(column(jc)) }) diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 6cf628e3007de..ce88d0b071b72 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -50,6 +50,7 @@ readTypedObject <- function(con, type) { "t" = readTime(con), "a" = readArray(con), "l" = readList(con), + "e" = readEnv(con), "n" = NULL, "j" = getJobj(readString(con)), stop(paste("Unsupported type for deserialization", type))) @@ -57,8 +58,10 @@ readTypedObject <- function(con, type) { readString <- function(con) { stringLen <- readInt(con) - string <- readBin(con, raw(), stringLen, endian = "big") - rawToChar(string) + raw <- readBin(con, raw(), stringLen, endian = "big") + string <- rawToChar(raw) + Encoding(string) <- "UTF-8" + string } readInt <- function(con) { @@ -119,6 +122,19 @@ readList <- function(con) { } } +readEnv <- function(con) { + env <- new.env() + len <- readInt(con) + if (len > 0) { + for (i in 1:len) { + key <- readString(con) + value <- readObject(con) + env[[key]] <- value + } + } + env +} + readRaw <- function(con) { dataLen <- readInt(con) readBin(con, raw(), as.integer(dataLen), endian = "big") diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index d848730e70433..94687edb05442 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1331,7 +1331,7 @@ setMethod("countDistinct", x@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc, - listToSeq(jcol)) + jcol) column(jc) }) @@ -1348,7 +1348,7 @@ setMethod("concat", signature(x = "Column"), function(x, ...) { jcols <- lapply(list(x, ...), function(x) { x@jc }) - jc <- callJStatic("org.apache.spark.sql.functions", "concat", listToSeq(jcols)) + jc <- callJStatic("org.apache.spark.sql.functions", "concat", jcols) column(jc) }) @@ -1366,7 +1366,7 @@ setMethod("greatest", function(x, ...) 
{ stopifnot(length(list(...)) > 0) jcols <- lapply(list(x, ...), function(x) { x@jc }) - jc <- callJStatic("org.apache.spark.sql.functions", "greatest", listToSeq(jcols)) + jc <- callJStatic("org.apache.spark.sql.functions", "greatest", jcols) column(jc) }) @@ -1384,7 +1384,7 @@ setMethod("least", function(x, ...) { stopifnot(length(list(...)) > 0) jcols <- lapply(list(x, ...), function(x) { x@jc }) - jc <- callJStatic("org.apache.spark.sql.functions", "least", listToSeq(jcols)) + jc <- callJStatic("org.apache.spark.sql.functions", "least", jcols) column(jc) }) @@ -1675,7 +1675,7 @@ setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"), #' @export setMethod("concat_ws", signature(sep = "character", x = "Column"), function(sep, x, ...) { - jcols <- listToSeq(lapply(list(x, ...), function(x) { x@jc })) + jcols <- lapply(list(x, ...), function(x) { x@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "concat_ws", sep, jcols) column(jc) }) @@ -1723,7 +1723,7 @@ setMethod("expr", signature(x = "character"), #' @export setMethod("format_string", signature(format = "character", x = "Column"), function(format, x, ...) { - jcols <- listToSeq(lapply(list(x, ...), function(arg) { arg@jc })) + jcols <- lapply(list(x, ...), function(arg) { arg@jc }) jc <- callJStatic("org.apache.spark.sql.functions", "format_string", format, jcols) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index b578b8789d2c5..3db41e0fe2bb6 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -467,7 +467,7 @@ setGeneric("merge") #' @rdname withColumn #' @export -setGeneric("mutate", function(x, ...) {standardGeneric("mutate") }) +setGeneric("mutate", function(.data, ...) {standardGeneric("mutate") }) #' @rdname arrange #' @export @@ -507,6 +507,10 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) { standardGeneric("saveAsTable") }) +#' @rdname withColumn +#' @export +setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") }) + #' @rdname write.df #' @export setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") }) @@ -531,6 +535,10 @@ setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr") #' @export setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) +# @rdname subset +# @export +setGeneric("subset", function(x, subset, select, ...) { standardGeneric("subset") }) + #' @rdname agg #' @export setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) @@ -975,3 +983,7 @@ setGeneric("glm") #' @rdname rbind #' @export setGeneric("rbind", signature = "...") + +#' @rdname as.data.frame +#' @export +setGeneric("as.data.frame") \ No newline at end of file diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 576ac72f40fc0..4cab1a69f601a 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -102,7 +102,7 @@ setMethod("agg", } } jcols <- lapply(cols, function(c) { c@jc }) - sdf <- callJMethod(x@sgd, "agg", jcols[[1]], listToSeq(jcols[-1])) + sdf <- callJMethod(x@sgd, "agg", jcols[[1]], jcols[-1]) } else { stop("agg can only support Column or character") } @@ -124,7 +124,7 @@ createMethod <- function(name) { setMethod(name, signature(x = "GroupedData"), function(x, ...) 
{ - sdf <- callJMethod(x@sgd, name, toSeq(...)) + sdf <- callJMethod(x@sgd, name, list(...)) dataFrame(sdf) }) } diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index cea3d760d05fe..cd00bbbeec698 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -27,7 +27,7 @@ setClass("PipelineModel", representation(model = "jobj")) #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package. #' #' @param formula A symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '+', '-', and '.'. +#' operators are supported, including '~', '.', ':', '+', and '-'. #' @param data DataFrame for training #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg. #' @param lambda Regularization parameter @@ -41,7 +41,8 @@ setClass("PipelineModel", representation(model = "jobj")) #' sqlContext <- sparkRSQL.init(sc) #' data(iris) #' df <- createDataFrame(sqlContext, iris) -#' model <- glm(Sepal_Length ~ Sepal_Width, df) +#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian") +#' summary(model) #'} setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"), function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0) { diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 79c744ef29c23..8df1563f8ebc0 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -56,7 +56,7 @@ structType.structField <- function(x, ...) { }) stObj <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "createStructType", - listToSeq(sfObjList)) + sfObjList) structType(stObj) } @@ -114,6 +114,55 @@ structField.jobj <- function(x) { obj } +checkType <- function(type) { + primtiveTypes <- c("byte", + "integer", + "float", + "double", + "numeric", + "character", + "string", + "binary", + "raw", + "logical", + "boolean", + "timestamp", + "date") + if (type %in% primtiveTypes) { + return() + } else { + # Check complex types + firstChar <- substr(type, 1, 1) + switch (firstChar, + a = { + # Array type + m <- regexec("^array<(.*)>$", type) + matchedStrings <- regmatches(type, m) + if (length(matchedStrings[[1]]) >= 2) { + elemType <- matchedStrings[[1]][2] + checkType(elemType) + return() + } + }, + m = { + # Map type + m <- regexec("^map<(.*),(.*)>$", type) + matchedStrings <- regmatches(type, m) + if (length(matchedStrings[[1]]) >= 3) { + keyType <- matchedStrings[[1]][2] + if (keyType != "string" && keyType != "character") { + stop("Key type in a map must be string or character") + } + valueType <- matchedStrings[[1]][3] + checkType(valueType) + return() + } + }) + } + + stop(paste("Unsupported type for Dataframe:", type)) +} + structField.character <- function(x, type, nullable = TRUE) { if (class(x) != "character") { stop("Field name must be a string.") @@ -124,28 +173,13 @@ structField.character <- function(x, type, nullable = TRUE) { if (class(nullable) != "logical") { stop("nullable must be either TRUE or FALSE") } - options <- c("byte", - "integer", - "float", - "double", - "numeric", - "character", - "string", - "binary", - "raw", - "logical", - "boolean", - "timestamp", - "date") - dataType <- if (type %in% options) { - type - } else { - stop(paste("Unsupported type for Dataframe:", type)) - } + + checkType(type) + sfObj <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "createStructField", x, - dataType, + type, nullable) structField(sfObj) } diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index e3676f57f907f..91e6b3e5609b5 100644 --- 
a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -79,7 +79,7 @@ writeJobj <- function(con, value) { writeString <- function(con, value) { utfVal <- enc2utf8(value) writeInt(con, as.integer(nchar(utfVal, type = "bytes") + 1)) - writeBin(utfVal, con, endian = "big") + writeBin(utfVal, con, endian = "big", useBytes=TRUE) } writeInt <- function(con, value) { diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 3babcb519378e..69a2bc728f842 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -361,16 +361,6 @@ numToInt <- function(num) { as.integer(num) } -# create a Seq in JVM -toSeq <- function(...) { - callJStatic("org.apache.spark.sql.api.r.SQLUtils", "toSeq", list(...)) -} - -# create a Seq in JVM from a list -listToSeq <- function(l) { - callJStatic("org.apache.spark.sql.api.r.SQLUtils", "toSeq", l) -} - # Utility function to recursively traverse the Abstract Syntax Tree (AST) of a # user defined function (UDF), and to examine variables in the UDF to decide # if their values should be included in the new function environment. diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R index f272de78ad4a6..032f8ec68b9d0 100644 --- a/R/pkg/inst/tests/test_mllib.R +++ b/R/pkg/inst/tests/test_mllib.R @@ -49,6 +49,14 @@ test_that("dot minus and intercept vs native glm", { expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) }) +test_that("feature interaction vs native glm", { + training <- createDataFrame(sqlContext, iris) + model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training) + vals <- collect(select(predict(model, training), "prediction")) + rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) +}) + test_that("summary coefficients match with native glm", { training <- createDataFrame(sqlContext, iris) stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training)) @@ -57,5 +65,5 @@ test_that("summary coefficients match with native glm", { expect_true(all(abs(rCoefs - coefs) < 1e-6)) expect_true(all( as.character(stats$features) == - c("(Intercept)", "Sepal_Length", "Species__versicolor", "Species__virginica"))) + c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica"))) }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 933b11c8ee7e2..8f85eecbc4a97 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -49,26 +49,32 @@ mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", jsonPathNa <- tempfile(pattern="sparkr-test", fileext=".tmp") writeLines(mockLinesNa, jsonPathNa) -test_that("infer types", { +# For test complex types in DataFrame +mockLinesComplexType <- + c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}", + "{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}", + "{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}") +complexTypeJsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") +writeLines(mockLinesComplexType, complexTypeJsonPath) + +test_that("infer types and check types", { expect_equal(infer_type(1L), "integer") expect_equal(infer_type(1.0), "double") expect_equal(infer_type("abc"), "string") expect_equal(infer_type(TRUE), "boolean") expect_equal(infer_type(as.Date("2015-03-11")), "date") expect_equal(infer_type(as.POSIXlt("2015-03-11 12:13:04.043")), "timestamp") - expect_equal(infer_type(c(1L, 2L)), - list(type = "array", elementType = "integer", containsNull = TRUE)) - 
expect_equal(infer_type(list(1L, 2L)), - list(type = "array", elementType = "integer", containsNull = TRUE)) + expect_equal(infer_type(c(1L, 2L)), "array") + expect_equal(infer_type(list(1L, 2L)), "array") testStruct <- infer_type(list(a = 1L, b = "2")) expect_equal(class(testStruct), "structType") checkStructField(testStruct$fields()[[1]], "a", "IntegerType", TRUE) checkStructField(testStruct$fields()[[2]], "b", "StringType", TRUE) e <- new.env() assign("a", 1L, envir = e) - expect_equal(infer_type(e), - list(type = "map", keyType = "string", valueType = "integer", - valueContainsNull = TRUE)) + expect_equal(infer_type(e), "map") + + expect_error(checkType("map"), "Key type in a map must be string or character") }) test_that("structType and structField", { @@ -236,8 +242,7 @@ test_that("create DataFrame with different data types", { expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE)) }) -# TODO: enable this test after fix serialization for nested object -#test_that("create DataFrame with nested array and struct", { +test_that("create DataFrame with nested array and map", { # e <- new.env() # assign("n", 3L, envir = e) # l <- list(1:10, list("a", "b"), e, list(a="aa", b=3L)) @@ -247,7 +252,64 @@ test_that("create DataFrame with different data types", { # expect_equal(count(df), 1) # ldf <- collect(df) # expect_equal(ldf[1,], l[[1]]) -#}) + + # ArrayType and MapType + e <- new.env() + assign("n", 3L, envir = e) + + l <- list(as.list(1:10), list("a", "b"), e) + df <- createDataFrame(sqlContext, list(l), c("a", "b", "c")) + expect_equal(dtypes(df), list(c("a", "array"), + c("b", "array"), + c("c", "map"))) + expect_equal(count(df), 1) + ldf <- collect(df) + expect_equal(names(ldf), c("a", "b", "c")) + expect_equal(ldf[1, 1][[1]], l[[1]]) + expect_equal(ldf[1, 2][[1]], l[[2]]) + e <- ldf$c[[1]] + expect_equal(class(e), "environment") + expect_equal(ls(e), "n") + expect_equal(e$n, 3L) +}) + +# For test map type in DataFrame +mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}", + "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}", + "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}") +mapTypeJsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") +writeLines(mockLinesMapType, mapTypeJsonPath) + +test_that("Collect DataFrame with complex types", { + # ArrayType + df <- jsonFile(sqlContext, complexTypeJsonPath) + + ldf <- collect(df) + expect_equal(nrow(ldf), 3) + expect_equal(ncol(ldf), 3) + expect_equal(names(ldf), c("c1", "c2", "c3")) + expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list (7, 8, 9))) + expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list ("g", "h", "i"))) + expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list (7.0, 8.0, 9.0))) + + # MapType + schema <- structType(structField("name", "string"), + structField("info", "map")) + df <- read.df(sqlContext, mapTypeJsonPath, "json", schema) + expect_equal(dtypes(df), list(c("name", "string"), + c("info", "map"))) + ldf <- collect(df) + expect_equal(nrow(ldf), 3) + expect_equal(ncol(ldf), 2) + expect_equal(names(ldf), c("name", "info")) + expect_equal(ldf$name, c("Bob", "Alice", "David")) + bob <- ldf$info[[1]] + expect_equal(class(bob), "environment") + expect_equal(bob$age, 16) + expect_equal(bob$height, 176.5) + + # TODO: tests for StructType after it is supported +}) test_that("jsonFile() on a local file returns a DataFrame", { df <- jsonFile(sqlContext, jsonPath) @@ -431,6 +493,32 @@ test_that("collect() and take() 
on a DataFrame return the same number of rows an expect_equal(ncol(collect(df)), ncol(take(df, 10))) }) +test_that("collect() support Unicode characters", { + markUtf8 <- function(s) { + Encoding(s) <- "UTF-8" + s + } + + lines <- c("{\"name\":\"안녕하세요\"}", + "{\"name\":\"您好\", \"age\":30}", + "{\"name\":\"こんにちは\", \"age\":19}", + "{\"name\":\"Xin chào\"}") + + jsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") + writeLines(lines, jsonPath) + + df <- read.df(sqlContext, jsonPath, "json") + rdf <- collect(df) + expect_true(is.data.frame(rdf)) + expect_equal(rdf$name[1], markUtf8("안녕하세요")) + expect_equal(rdf$name[2], markUtf8("您好")) + expect_equal(rdf$name[3], markUtf8("こんにちは")) + expect_equal(rdf$name[4], markUtf8("Xin chào")) + + df1 <- createDataFrame(sqlContext, rdf) + expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好")) +}) + test_that("multiple pipeline transformations result in an RDD with the correct values", { df <- jsonFile(sqlContext, jsonPath) first <- lapply(df, function(row) { @@ -612,6 +700,10 @@ test_that("subsetting", { df5 <- df[df$age %in% c(19), c(1,2)] expect_equal(count(df5), 1) expect_equal(columns(df5), c("name", "age")) + + df6 <- subset(df, df$age %in% c(30), c(1,2)) + expect_equal(count(df6), 1) + expect_equal(columns(df6), c("name", "age")) }) test_that("selectExpr() on a DataFrame", { @@ -1028,7 +1120,7 @@ test_that("withColumn() and withColumnRenamed()", { expect_equal(columns(newDF2)[1], "newerAge") }) -test_that("mutate(), rename() and names()", { +test_that("mutate(), transform(), rename() and names()", { df <- jsonFile(sqlContext, jsonPath) newDF <- mutate(df, newAge = df$age + 2) expect_equal(length(columns(newDF)), 3) @@ -1042,6 +1134,20 @@ test_that("mutate(), rename() and names()", { names(newDF2) <- c("newerName", "evenNewerAge") expect_equal(length(names(newDF2)), 2) expect_equal(names(newDF2)[1], "newerName") + + transformedDF <- transform(df, newAge = -df$age, newAge2 = df$age / 2) + expect_equal(length(columns(transformedDF)), 4) + expect_equal(columns(transformedDF)[3], "newAge") + expect_equal(columns(transformedDF)[4], "newAge2") + expect_equal(first(filter(transformedDF, transformedDF$name == "Andy"))$newAge, -30) + + # test if transform on local data frames works + # ensure the proper signature is used - otherwise this will fail to run + attach(airquality) + result <- transform(Ozone, logOzone = log(Ozone)) + expect_equal(nrow(result), 153) + expect_equal(ncol(result), 2) + detach(airquality) }) test_that("write.df() on DataFrame and works with parquetFile", { @@ -1073,7 +1179,7 @@ test_that("describe() and summarize() on a DataFrame", { stats <- describe(df, "age") expect_equal(collect(stats)[1, "summary"], "count") expect_equal(collect(stats)[2, "age"], "24.5") - expect_equal(collect(stats)[3, "age"], "5.5") + expect_equal(collect(stats)[3, "age"], "7.7781745930520225") stats <- describe(df) expect_equal(collect(stats)[4, "name"], "Andy") expect_equal(collect(stats)[5, "age"], "30") @@ -1221,6 +1327,13 @@ test_that("SQL error message is returned from JVM", { expect_equal(grepl("Table Not Found: blah", retError), TRUE) }) +test_that("Method as.data.frame as a synonym for collect()", { + irisDF <- createDataFrame(sqlContext, iris) + expect_equal(as.data.frame(irisDF), collect(irisDF)) + irisDF2 <- irisDF[irisDF$Species == "setosa", ] + expect_equal(as.data.frame(irisDF2), collect(irisDF2)) +}) + unlink(parquetPath) unlink(jsonPath) -unlink(jsonPathNa) +unlink(jsonPathNa) \ No newline at end of file diff 
--git a/README.md b/README.md index 380422ca00dbe..76e29b4235666 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Apache Spark Spark is a fast and general cluster computing system for Big Data. It provides -high-level APIs in Scala, Java, and Python, and an optimized engine that +high-level APIs in Scala, Java, Python, and R, and an optimized engine that supports general computation graphs for data analysis. It also supports a rich set of higher-level tools including Spark SQL for SQL and DataFrames, MLlib for machine learning, GraphX for graph processing, @@ -94,5 +94,5 @@ distribution. ## Configuration -Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html) +Please refer to the [Configuration Guide](http://spark.apache.org/docs/latest/configuration.html) in the online documentation for an overview on how to configure Spark. diff --git a/assembly/pom.xml b/assembly/pom.xml index e9c6d26ccddc7..4b60ee00ffbe5 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index ed5c37e595a96..3baf8d47b4dc7 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala b/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala index ef0bb2ac13f08..8399033ac61ec 100644 --- a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala +++ b/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala @@ -22,6 +22,7 @@ import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") object Bagel extends Logging { val DEFAULT_STORAGE_LEVEL = StorageLevel.MEMORY_AND_DISK @@ -78,7 +79,7 @@ object Bagel extends Logging { val startTime = System.currentTimeMillis val aggregated = agg(verts, aggregator) - val combinedMsgs = msgs.combineByKey( + val combinedMsgs = msgs.combineByKeyWithClassTag( combiner.createCombiner _, combiner.mergeMsg _, combiner.mergeCombiners _, partitioner) val grouped = combinedMsgs.groupWith(verts) val superstep_ = superstep // Create a read-only copy of superstep for capture in closure @@ -270,18 +271,21 @@ object Bagel extends Logging { } } +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") trait Combiner[M, C] { def createCombiner(msg: M): C def mergeMsg(combiner: C, msg: M): C def mergeCombiners(a: C, b: C): C } +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") trait Aggregator[V, A] { def createAggregator(vert: V): A def mergeAggregators(a: A, b: A): A } /** Default combiner that simply appends messages together (i.e. performs no aggregation) */ +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") class DefaultCombiner[M: Manifest] extends Combiner[M, Array[M]] with Serializable { def createCombiner(msg: M): Array[M] = Array(msg) @@ -297,6 +301,7 @@ class DefaultCombiner[M: Manifest] extends Combiner[M, Array[M]] with Serializab * Subclasses may store state along with each vertex and must * inherit from java.io.Serializable or scala.Serializable. 
*/ +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") trait Vertex { def active: Boolean } @@ -307,6 +312,7 @@ trait Vertex { * Subclasses may contain a payload to deliver to the target vertex * and must inherit from java.io.Serializable or scala.Serializable. */ +@deprecated("Uses of Bagel should migrate to GraphX", "1.6.0") trait Message[K] { def targetId: K } diff --git a/bagel/src/test/resources/log4j.properties b/bagel/src/test/resources/log4j.properties deleted file mode 100644 index edbecdae92096..0000000000000 --- a/bagel/src/test/resources/log4j.properties +++ /dev/null @@ -1,27 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Set everything to be logged to the file target/unit-tests.log -log4j.rootCategory=INFO, file -log4j.appender.file=org.apache.log4j.FileAppender -log4j.appender.file.append=true -log4j.appender.file.file=target/unit-tests.log -log4j.appender.file.layout=org.apache.log4j.PatternLayout -log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n - -# Ignore messages below warning level from Jetty, because it's a bit verbose -log4j.logger.org.spark-project.jetty=WARN diff --git a/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala b/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala deleted file mode 100644 index fb10d734ac74b..0000000000000 --- a/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.bagel - -import org.scalatest.{BeforeAndAfter, Assertions} -import org.scalatest.concurrent.Timeouts -import org.scalatest.time.SpanSugar._ - -import org.apache.spark._ -import org.apache.spark.storage.StorageLevel - -class TestVertex(val active: Boolean, val age: Int) extends Vertex with Serializable -class TestMessage(val targetId: String) extends Message[String] with Serializable - -class BagelSuite extends SparkFunSuite with Assertions with BeforeAndAfter with Timeouts { - - var sc: SparkContext = _ - - after { - if (sc != null) { - sc.stop() - sc = null - } - } - - test("halting by voting") { - sc = new SparkContext("local", "test") - val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(true, 0)))) - val msgs = sc.parallelize(Array[(String, TestMessage)]()) - val numSupersteps = 5 - val result = - Bagel.run(sc, verts, msgs, sc.defaultParallelism) { - (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => - (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) - } - for ((id, vert) <- result.collect) { - assert(vert.age === numSupersteps) - } - } - - test("halting by message silence") { - sc = new SparkContext("local", "test") - val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(false, 0)))) - val msgs = sc.parallelize(Array("a" -> new TestMessage("a"))) - val numSupersteps = 5 - val result = - Bagel.run(sc, verts, msgs, sc.defaultParallelism) { - (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => - val msgsOut = - msgs match { - case Some(ms) if (superstep < numSupersteps - 1) => - ms - case _ => - Array[TestMessage]() - } - (new TestVertex(self.active, self.age + 1), msgsOut) - } - for ((id, vert) <- result.collect) { - assert(vert.age === numSupersteps) - } - } - - test("large number of iterations") { - // This tests whether jobs with a large number of iterations finish in a reasonable time, - // because non-memoized recursion in RDD or DAGScheduler used to cause them to hang - failAfter(30 seconds) { - sc = new SparkContext("local", "test") - val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0)))) - val msgs = sc.parallelize(Array[(String, TestMessage)]()) - val numSupersteps = 50 - val result = - Bagel.run(sc, verts, msgs, sc.defaultParallelism) { - (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => - (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) - } - for ((id, vert) <- result.collect) { - assert(vert.age === numSupersteps) - } - } - } - - test("using non-default persistence level") { - failAfter(10 seconds) { - sc = new SparkContext("local", "test") - val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0)))) - val msgs = sc.parallelize(Array[(String, TestMessage)]()) - val numSupersteps = 20 - val result = - Bagel.run(sc, verts, msgs, sc.defaultParallelism, StorageLevel.DISK_ONLY) { - (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => - (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) - } - for ((id, vert) <- result.collect) { - assert(vert.age === numSupersteps) - } - } - } -} diff --git a/conf/docker.properties.template b/conf/docker.properties.template index 26e3bfd9c5b9b..55cb094b4af46 100644 --- a/conf/docker.properties.template +++ b/conf/docker.properties.template @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under 
one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + spark.mesos.executor.docker.image: spark.mesos.executor.docker.volumes: /usr/local/lib:/host/usr/local/lib:ro spark.mesos.executor.home: /opt/spark diff --git a/conf/fairscheduler.xml.template b/conf/fairscheduler.xml.template index acf59e2a35986..385b2e772d2c8 100644 --- a/conf/fairscheduler.xml.template +++ b/conf/fairscheduler.xml.template @@ -1,4 +1,22 @@ + + + FAIR diff --git a/conf/log4j.properties.template b/conf/log4j.properties.template index 74c5cea94403a..f3046be54d7c6 100644 --- a/conf/log4j.properties.template +++ b/conf/log4j.properties.template @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # Set everything to be logged to the console log4j.rootCategory=INFO, console log4j.appender.console=org.apache.log4j.ConsoleAppender diff --git a/conf/metrics.properties.template b/conf/metrics.properties.template index 7f17bc7eea4f5..d6962e0da2f30 100644 --- a/conf/metrics.properties.template +++ b/conf/metrics.properties.template @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # syntax: [instance].sink|source.[name].[options]=[value] # This file configures Spark's internal metrics system. 
The metrics system is diff --git a/conf/slaves.template b/conf/slaves.template index da0a01343d20a..be42a638230b7 100644 --- a/conf/slaves.template +++ b/conf/slaves.template @@ -1,2 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # A Spark Worker will be started on each of the machines listed below. localhost \ No newline at end of file diff --git a/conf/spark-defaults.conf.template b/conf/spark-defaults.conf.template index a48dcc70e1363..19cba6e71ed19 100644 --- a/conf/spark-defaults.conf.template +++ b/conf/spark-defaults.conf.template @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # Default system properties included when running spark-submit. # This is useful for setting default environmental settings. diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template index c05fe381a36a7..990ded420be72 100755 --- a/conf/spark-env.sh.template +++ b/conf/spark-env.sh.template @@ -1,5 +1,22 @@ #!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # This file is sourced when running various Spark programs. # Copy it as spark-env.sh and edit that to configure Spark for your site. 
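
Note: the @deprecated annotations added to the Bagel object and traits earlier in this diff point users at GraphX as the replacement. The following is a minimal, hedged sketch of that migration path using GraphX's Pregel operator; the toy chain graph, vertex values, and object name are invented for illustration and are not part of the patch.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.graphx.{Edge, Graph}

    object BagelToGraphXSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("bagel-to-graphx"))

        // A toy chain graph; each vertex attribute starts as the vertex's own id.
        val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(3L, 4L, 1)))
        val graph = Graph.fromEdges(edges, defaultValue = Long.MaxValue).mapVertices((id, _) => id)

        // Propagate the minimum reachable id; Pregel runs the superstep loop that
        // Bagel.run used to drive explicitly.
        val result = graph.pregel(Long.MaxValue)(
          vprog = (id, attr, msg) => math.min(attr, msg),
          sendMsg = t => if (t.srcAttr < t.dstAttr) Iterator((t.dstId, t.srcAttr)) else Iterator.empty,
          mergeMsg = (a, b) => math.min(a, b)
        )

        result.vertices.collect().foreach(println)
        sc.stop()
      }
    }

Pregel halts when no messages are sent (or after maxIterations), which roughly corresponds to Bagel's halting by message silence, so user code no longer tracks supersteps itself.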
diff --git a/core/pom.xml b/core/pom.xml index 4f79d71bf85fa..e31d90f608892 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml @@ -272,6 +272,14 @@ org.apache.hadoop hadoop-client + + org.apache.curator + curator-client + + + org.apache.curator + curator-framework + org.apache.curator curator-recipes diff --git a/core/src/main/scala/org/apache/spark/annotation/AlphaComponent.java b/core/src/main/java/org/apache/spark/annotation/AlphaComponent.java similarity index 100% rename from core/src/main/scala/org/apache/spark/annotation/AlphaComponent.java rename to core/src/main/java/org/apache/spark/annotation/AlphaComponent.java diff --git a/core/src/main/scala/org/apache/spark/annotation/DeveloperApi.java b/core/src/main/java/org/apache/spark/annotation/DeveloperApi.java similarity index 100% rename from core/src/main/scala/org/apache/spark/annotation/DeveloperApi.java rename to core/src/main/java/org/apache/spark/annotation/DeveloperApi.java diff --git a/core/src/main/scala/org/apache/spark/annotation/Experimental.java b/core/src/main/java/org/apache/spark/annotation/Experimental.java similarity index 100% rename from core/src/main/scala/org/apache/spark/annotation/Experimental.java rename to core/src/main/java/org/apache/spark/annotation/Experimental.java diff --git a/core/src/main/scala/org/apache/spark/annotation/Private.java b/core/src/main/java/org/apache/spark/annotation/Private.java similarity index 100% rename from core/src/main/scala/org/apache/spark/annotation/Private.java rename to core/src/main/java/org/apache/spark/annotation/Private.java diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java index 0b8b604e18494..f5d80bbcf3557 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java @@ -151,7 +151,7 @@ public long[] writePartitionedFile( } finally { Closeables.close(in, copyThrewException); } - if (!blockManager.diskBlockManager().getFile(partitionWriters[i].blockId()).delete()) { + if (!partitionWriters[i].fileSegment().file().delete()) { logger.error("Unable to delete file for partition {}", i); } } @@ -168,12 +168,11 @@ public long[] writePartitionedFile( public void stop() throws IOException { if (partitionWriters != null) { try { - final DiskBlockManager diskBlockManager = blockManager.diskBlockManager(); for (DiskBlockObjectWriter writer : partitionWriters) { // This method explicitly does _not_ throw exceptions: - writer.revertPartialWritesAndClose(); - if (!diskBlockManager.getFile(writer.blockId()).delete()) { - logger.error("Error while deleting file for block {}", writer.blockId()); + File file = writer.revertPartialWritesAndClose(); + if (!file.delete()) { + logger.error("Error while deleting file {}", file.getAbsolutePath()); } } } finally { diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java index 3d1ef0c48adc5..e73ba39468828 100644 --- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java @@ -122,6 +122,10 @@ public UnsafeShuffleExternalSorter( this.maxRecordSizeBytes = 
pageSizeBytes - 4; this.writeMetrics = writeMetrics; initializeForWriting(); + + // preserve first page to ensure that we have at least one page to work with. Otherwise, + // other operators in the same task may starve this sorter (SPARK-9709). + acquireNewPageIfNecessary(pageSizeBytes); } /** diff --git a/core/src/main/java/org/apache/spark/util/collection/TimSort.java b/core/src/main/java/org/apache/spark/util/collection/TimSort.java index a90cc0e761f62..40b5fb7fe4b49 100644 --- a/core/src/main/java/org/apache/spark/util/collection/TimSort.java +++ b/core/src/main/java/org/apache/spark/util/collection/TimSort.java @@ -15,6 +15,24 @@ * limitations under the License. */ +/* + * Based on TimSort.java from the Android Open Source Project + * + * Copyright (C) 2008 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.util.collection; import java.util.Comparator; diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index fc364e0a895b1..0a311d2d935ac 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -160,15 +160,14 @@ public BoxedUnit apply() { * Allocates new sort data structures. Called when creating the sorter and after each spill. */ private void initializeForWriting() throws IOException { + // Note: Do not track memory for the pointer array for now because of SPARK-10474. + // In more detail, in TungstenAggregate we only reserve a page, but when we fall back to + // sort-based aggregation we try to acquire a page AND a pointer array, which inevitably + // fails if all other memory is already occupied. It should be safe to not track the array + // because its memory footprint is frequently much smaller than that of a page. This is a + // temporary hack that we should address in 1.6.0. + // TODO: track the pointer array memory! 
this.writeMetrics = new ShuffleWriteMetrics(); - final long pointerArrayMemory = - UnsafeInMemorySorter.getMemoryRequirementsForPointerArray(initialSize); - final long memoryAcquired = shuffleMemoryManager.tryToAcquire(pointerArrayMemory); - if (memoryAcquired != pointerArrayMemory) { - shuffleMemoryManager.release(memoryAcquired); - throw new IOException("Could not acquire " + pointerArrayMemory + " bytes of memory"); - } - this.inMemSorter = new UnsafeInMemorySorter(taskMemoryManager, recordComparator, prefixComparator, initialSize); this.isInMemSorterExternal = false; @@ -265,14 +264,7 @@ private long freeMemory() { shuffleMemoryManager.release(block.size()); memoryFreed += block.size(); } - if (inMemSorter != null) { - if (!isInMemSorterExternal) { - long sorterMemoryUsage = inMemSorter.getMemoryUsage(); - memoryFreed += sorterMemoryUsage; - shuffleMemoryManager.release(sorterMemoryUsage); - } - inMemSorter = null; - } + // TODO: track in-memory sorter memory usage (SPARK-10474) allocatedPages.clear(); currentPage = null; currentPagePosition = -1; @@ -310,17 +302,8 @@ public void cleanupResources() { private void growPointerArrayIfNecessary() throws IOException { assert(inMemSorter != null); if (!inMemSorter.hasSpaceForAnotherRecord()) { - logger.debug("Attempting to expand sort pointer array"); - final long oldPointerArrayMemoryUsage = inMemSorter.getMemoryUsage(); - final long memoryToGrowPointerArray = oldPointerArrayMemoryUsage * 2; - final long memoryAcquired = shuffleMemoryManager.tryToAcquire(memoryToGrowPointerArray); - if (memoryAcquired < memoryToGrowPointerArray) { - shuffleMemoryManager.release(memoryAcquired); - spill(); - } else { - inMemSorter.expandPointerArray(); - shuffleMemoryManager.release(oldPointerArrayMemoryUsage); - } + // TODO: track the pointer array memory! (SPARK-10474) + inMemSorter.expandPointerArray(); } } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java index 4989b05d63e23..501dfe77d13cb 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java @@ -24,12 +24,15 @@ import org.apache.spark.storage.BlockId; import org.apache.spark.storage.BlockManager; import org.apache.spark.unsafe.Platform; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Reads spill files written by {@link UnsafeSorterSpillWriter} (see that class for a description * of the file format). 
*/ final class UnsafeSorterSpillReader extends UnsafeSorterIterator { + private static final Logger logger = LoggerFactory.getLogger(UnsafeSorterSpillReader.class); private final File file; private InputStream in; @@ -73,7 +76,9 @@ public void loadNext() throws IOException { numRecordsRemaining--; if (numRecordsRemaining == 0) { in.close(); - file.delete(); + if (!file.delete() && file.exists()) { + logger.warn("Unable to delete spill file {}", file.getPath()); + } in = null; din = null; } diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties b/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties index 689afea64f8db..c85abc35b93bf 100644 --- a/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties +++ b/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # Set everything to be logged to the console log4j.rootCategory=WARN, console log4j.appender.console=org.apache.log4j.ConsoleAppender diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults.properties b/core/src/main/resources/org/apache/spark/log4j-defaults.properties index 27006e45e932b..d44cc85dcbd82 100644 --- a/core/src/main/resources/org/apache/spark/log4j-defaults.properties +++ b/core/src/main/resources/org/apache/spark/log4j-defaults.properties @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + # Set everything to be logged to the console log4j.rootCategory=INFO, console log4j.appender.console=org.apache.log4j.ConsoleAppender diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala index c39c8667d013e..5592b75afb75b 100644 --- a/core/src/main/scala/org/apache/spark/Accumulators.scala +++ b/core/src/main/scala/org/apache/spark/Accumulators.scala @@ -47,7 +47,7 @@ import org.apache.spark.util.Utils * @tparam T partial data that can be added in */ class Accumulable[R, T] private[spark] ( - @transient initialValue: R, + initialValue: R, param: AccumulableParam[R, T], val name: Option[String], internal: Boolean) diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala index 289aab9bd9e51..7196e57d5d2e2 100644 --- a/core/src/main/scala/org/apache/spark/Aggregator.scala +++ b/core/src/main/scala/org/apache/spark/Aggregator.scala @@ -18,7 +18,7 @@ package org.apache.spark import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap} +import org.apache.spark.util.collection.ExternalAppendOnlyMap /** * :: DeveloperApi :: @@ -34,59 +34,30 @@ case class Aggregator[K, V, C] ( mergeValue: (C, V) => C, mergeCombiners: (C, C) => C) { - // When spilling is enabled sorting will happen externally, but not necessarily with an - // ExternalSorter. - private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true) - @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0") def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] = combineValuesByKey(iter, null) - def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]], - context: TaskContext): Iterator[(K, C)] = { - if (!isSpillEnabled) { - val combiners = new AppendOnlyMap[K, C] - var kv: Product2[K, V] = null - val update = (hadValue: Boolean, oldValue: C) => { - if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2) - } - while (iter.hasNext) { - kv = iter.next() - combiners.changeValue(kv._1, update) - } - combiners.iterator - } else { - val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners) - combiners.insertAll(iter) - updateMetrics(context, combiners) - combiners.iterator - } + def combineValuesByKey( + iter: Iterator[_ <: Product2[K, V]], + context: TaskContext): Iterator[(K, C)] = { + val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners) + combiners.insertAll(iter) + updateMetrics(context, combiners) + combiners.iterator } @deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0") def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]) : Iterator[(K, C)] = combineCombinersByKey(iter, null) - def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext) - : Iterator[(K, C)] = - { - if (!isSpillEnabled) { - val combiners = new AppendOnlyMap[K, C] - var kc: Product2[K, C] = null - val update = (hadValue: Boolean, oldValue: C) => { - if (hadValue) mergeCombiners(oldValue, kc._2) else kc._2 - } - while (iter.hasNext) { - kc = iter.next() - combiners.changeValue(kc._1, update) - } - combiners.iterator - } else { - val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners) - combiners.insertAll(iter) - updateMetrics(context, combiners) - combiners.iterator - } + def combineCombinersByKey( + iter: Iterator[_ <: 
Product2[K, C]], + context: TaskContext): Iterator[(K, C)] = { + val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners) + combiners.insertAll(iter) + updateMetrics(context, combiners) + combiners.iterator } /** Update task metrics after populating the external map. */ diff --git a/core/src/main/scala/org/apache/spark/Dependency.scala b/core/src/main/scala/org/apache/spark/Dependency.scala index fc8cdde9348ee..9aafc9eb1cde7 100644 --- a/core/src/main/scala/org/apache/spark/Dependency.scala +++ b/core/src/main/scala/org/apache/spark/Dependency.scala @@ -17,6 +17,8 @@ package org.apache.spark +import scala.reflect.ClassTag + import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer @@ -65,8 +67,8 @@ abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] { * @param mapSideCombine whether to perform partial aggregation (also known as map-side combine) */ @DeveloperApi -class ShuffleDependency[K, V, C]( - @transient _rdd: RDD[_ <: Product2[K, V]], +class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag]( + @transient private val _rdd: RDD[_ <: Product2[K, V]], val partitioner: Partitioner, val serializer: Option[Serializer] = None, val keyOrdering: Option[Ordering[K]] = None, @@ -76,6 +78,13 @@ class ShuffleDependency[K, V, C]( override def rdd: RDD[Product2[K, V]] = _rdd.asInstanceOf[RDD[Product2[K, V]]] + private[spark] val keyClassName: String = reflect.classTag[K].runtimeClass.getName + private[spark] val valueClassName: String = reflect.classTag[V].runtimeClass.getName + // Note: It's possible that the combiner class tag is null, if the combineByKey + // methods in PairRDDFunctions are used instead of combineByKeyWithClassTag. + private[spark] val combinerClassName: Option[String] = + Option(reflect.classTag[C]).map(_.runtimeClass.getName) + val shuffleId: Int = _rdd.context.newShuffleId() val shuffleHandle: ShuffleHandle = _rdd.context.env.shuffleManager.registerShuffle( diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index ee60d697d8799..1f1f0b75de5f1 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -20,6 +20,7 @@ package org.apache.spark import java.util.concurrent.{ScheduledFuture, TimeUnit} import scala.collection.mutable +import scala.concurrent.Future import org.apache.spark.executor.TaskMetrics import org.apache.spark.rpc.{ThreadSafeRpcEndpoint, RpcEnv, RpcCallContext} @@ -147,11 +148,31 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) } } + /** + * Send ExecutorRegistered to the event loop to add a new executor. Only for test. + * + * @return if HeartbeatReceiver is stopped, return None. Otherwise, return a Some(Future) that + * indicate if this operation is successful. + */ + def addExecutor(executorId: String): Option[Future[Boolean]] = { + Option(self).map(_.ask[Boolean](ExecutorRegistered(executorId))) + } + /** * If the heartbeat receiver is not stopped, notify it of executor registrations. */ override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { - Option(self).foreach(_.ask[Boolean](ExecutorRegistered(executorAdded.executorId))) + addExecutor(executorAdded.executorId) + } + + /** + * Send ExecutorRemoved to the event loop to remove a executor. Only for test. 
+ * + * @return if HeartbeatReceiver is stopped, return None. Otherwise, return a Some(Future) that + * indicate if this operation is successful. + */ + def removeExecutor(executorId: String): Option[Future[Boolean]] = { + Option(self).map(_.ask[Boolean](ExecutorRemoved(executorId))) } /** @@ -165,7 +186,7 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) * and expire it with loud error messages. */ override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { - Option(self).foreach(_.ask[Boolean](ExecutorRemoved(executorRemoved.executorId))) + removeExecutor(executorRemoved.executorId) } private def expireDeadHosts(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/network/nio/MessageChunk.scala b/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala similarity index 58% rename from core/src/main/scala/org/apache/spark/network/nio/MessageChunk.scala rename to core/src/main/scala/org/apache/spark/MapOutputStatistics.scala index a4568e849fa13..f8a6f1d0d8cbb 100644 --- a/core/src/main/scala/org/apache/spark/network/nio/MessageChunk.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala @@ -15,27 +15,13 @@ * limitations under the License. */ -package org.apache.spark.network.nio +package org.apache.spark -import java.nio.ByteBuffer - -import scala.collection.mutable.ArrayBuffer - -private[nio] -class MessageChunk(val header: MessageChunkHeader, val buffer: ByteBuffer) { - - val size: Int = if (buffer == null) 0 else buffer.remaining - - lazy val buffers: ArrayBuffer[ByteBuffer] = { - val ab = new ArrayBuffer[ByteBuffer]() - ab += header.buffer - if (buffer != null) { - ab += buffer - } - ab - } - - override def toString: String = { - "" + this.getClass.getSimpleName + " (id = " + header.id + ", size = " + size + ")" - } -} +/** + * Holds statistics about the output sizes in a map stage. May become a DeveloperApi in the future. + * + * @param shuffleId ID of the shuffle + * @param bytesByPartitionId approximate number of output bytes for each map output partition + * (may be inexact due to use of compressed map statuses) + */ +private[spark] class MapOutputStatistics(val shuffleId: Int, val bytesByPartitionId: Array[Long]) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index a387592783850..e4cb72e39d7ec 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.io._ +import java.util.Arrays import java.util.concurrent.ConcurrentHashMap import java.util.zip.{GZIPInputStream, GZIPOutputStream} @@ -132,13 +133,57 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging * describing the shuffle blocks that are stored at that block manager. */ def getMapSizesByExecutorId(shuffleId: Int, reduceId: Int) - : Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { - logDebug(s"Fetching outputs for shuffle $shuffleId, reduce $reduceId") - val startTime = System.currentTimeMillis + : Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { + getMapSizesByExecutorId(shuffleId, reduceId, reduceId + 1) + } + /** + * Called from executors to get the server URIs and output sizes for each shuffle block that + * needs to be read from a given range of map output partitions (startPartition is included but + * endPartition is excluded from the range). 
+ * + * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId, + * and the second item is a sequence of (shuffle block id, shuffle block size) tuples + * describing the shuffle blocks that are stored at that block manager. + */ + def getMapSizesByExecutorId(shuffleId: Int, startPartition: Int, endPartition: Int) + : Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { + logDebug(s"Fetching outputs for shuffle $shuffleId, partitions $startPartition-$endPartition") + val statuses = getStatuses(shuffleId) + // Synchronize on the returned array because, on the driver, it gets mutated in place + statuses.synchronized { + return MapOutputTracker.convertMapStatuses(shuffleId, startPartition, endPartition, statuses) + } + } + + /** + * Return statistics about all of the outputs for a given shuffle. + */ + def getStatistics(dep: ShuffleDependency[_, _, _]): MapOutputStatistics = { + val statuses = getStatuses(dep.shuffleId) + // Synchronize on the returned array because, on the driver, it gets mutated in place + statuses.synchronized { + val totalSizes = new Array[Long](dep.partitioner.numPartitions) + for (s <- statuses) { + for (i <- 0 until totalSizes.length) { + totalSizes(i) += s.getSizeForBlock(i) + } + } + new MapOutputStatistics(dep.shuffleId, totalSizes) + } + } + + /** + * Get or fetch the array of MapStatuses for a given shuffle ID. NOTE: clients MUST synchronize + * on this array when reading it, because on the driver, we may be changing it in place. + * + * (It would be nice to remove this restriction in the future.) + */ + private def getStatuses(shuffleId: Int): Array[MapStatus] = { val statuses = mapStatuses.get(shuffleId).orNull if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") + val startTime = System.currentTimeMillis var fetchedStatuses: Array[MapStatus] = null fetching.synchronized { // Someone else is fetching it; wait for them to be done @@ -160,7 +205,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging } if (fetchedStatuses == null) { - // We won the race to fetch the output locs; do so + // We won the race to fetch the statuses; do so logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint) // This try-finally prevents hangs due to timeouts: try { @@ -175,22 +220,18 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging } } } - logDebug(s"Fetching map output location for shuffle $shuffleId, reduce $reduceId took " + + logDebug(s"Fetching map output statuses for shuffle $shuffleId took " + s"${System.currentTimeMillis - startTime} ms") if (fetchedStatuses != null) { - fetchedStatuses.synchronized { - return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, fetchedStatuses) - } + return fetchedStatuses } else { logError("Missing all output locations for shuffle " + shuffleId) throw new MetadataFetchFailedException( - shuffleId, reduceId, "Missing all output locations for shuffle " + shuffleId) + shuffleId, -1, "Missing all output locations for shuffle " + shuffleId) } } else { - statuses.synchronized { - return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, statuses) - } + return statuses } } @@ -235,6 +276,21 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf) /** Cache a serialized version of the output statuses for each shuffle to send them out faster */ private var cacheEpoch = epoch + /** Whether to compute locality preferences for reduce tasks */ + private val shuffleLocalityEnabled = 
conf.getBoolean("spark.shuffle.reduceLocality.enabled", true) + + // Number of map and reduce tasks above which we do not assign preferred locations based on map + // output sizes. We limit the size of jobs for which assign preferred locations as computing the + // top locations by size becomes expensive. + private val SHUFFLE_PREF_MAP_THRESHOLD = 1000 + // NOTE: This should be less than 2000 as we use HighlyCompressedMapStatus beyond that + private val SHUFFLE_PREF_REDUCE_THRESHOLD = 1000 + + // Fraction of total map output that must be at a location for it to considered as a preferred + // location for a reduce task. Making this larger will focus on fewer locations where most data + // can be read locally, but may lead to more delay in scheduling if those locations are busy. + private val REDUCER_PREF_LOCS_FRACTION = 0.2 + /** * Timestamp based HashMap for storing mapStatuses and cached serialized statuses in the driver, * so that statuses are dropped only by explicit de-registering or by TTL-based cleaning (if set). @@ -295,6 +351,30 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf) cachedSerializedStatuses.contains(shuffleId) || mapStatuses.contains(shuffleId) } + /** + * Return the preferred hosts on which to run the given map output partition in a given shuffle, + * i.e. the nodes that the most outputs for that partition are on. + * + * @param dep shuffle dependency object + * @param partitionId map output partition that we want to read + * @return a sequence of host names + */ + def getPreferredLocationsForShuffle(dep: ShuffleDependency[_, _, _], partitionId: Int) + : Seq[String] = { + if (shuffleLocalityEnabled && dep.rdd.partitions.length < SHUFFLE_PREF_MAP_THRESHOLD && + dep.partitioner.numPartitions < SHUFFLE_PREF_REDUCE_THRESHOLD) { + val blockManagerIds = getLocationsWithLargestOutputs(dep.shuffleId, partitionId, + dep.partitioner.numPartitions, REDUCER_PREF_LOCS_FRACTION) + if (blockManagerIds.nonEmpty) { + blockManagerIds.get.map(_.host) + } else { + Nil + } + } else { + Nil + } + } + /** * Return a list of locations that each have fraction of map output greater than the specified * threshold. @@ -433,23 +513,25 @@ private[spark] object MapOutputTracker extends Logging { } /** - * Converts an array of MapStatuses for a given reduce ID to a sequence that, for each block - * manager ID, lists the shuffle block ids and corresponding shuffle block sizes stored at that - * block manager. + * Given an array of map statuses and a range of map output partitions, returns a sequence that, + * for each block manager ID, lists the shuffle block IDs and corresponding shuffle block sizes + * stored at that block manager. * * If any of the statuses is null (indicating a missing location due to a failed mapper), * throws a FetchFailedException. * * @param shuffleId Identifier for the shuffle - * @param reduceId Identifier for the reduce task + * @param startPartition Start of map output partition ID range (included in range) + * @param endPartition End of map output partition ID range (excluded from range) * @param statuses List of map statuses, indexed by map ID. * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId, - * and the second item is a sequence of (shuffle block id, shuffle block size) tuples + * and the second item is a sequence of (shuffle block ID, shuffle block size) tuples * describing the shuffle blocks that are stored at that block manager. 
*/ private def convertMapStatuses( shuffleId: Int, - reduceId: Int, + startPartition: Int, + endPartition: Int, statuses: Array[MapStatus]): Seq[(BlockManagerId, Seq[(BlockId, Long)])] = { assert (statuses != null) val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(BlockId, Long)]] @@ -457,10 +539,12 @@ private[spark] object MapOutputTracker extends Logging { if (status == null) { val errorMessage = s"Missing an output location for shuffle $shuffleId" logError(errorMessage) - throw new MetadataFetchFailedException(shuffleId, reduceId, errorMessage) + throw new MetadataFetchFailedException(shuffleId, startPartition, errorMessage) } else { - splitsByAddress.getOrElseUpdate(status.location, ArrayBuffer()) += - ((ShuffleBlockId(shuffleId, mapId, reduceId), status.getSizeForBlock(reduceId))) + for (part <- startPartition until endPartition) { + splitsByAddress.getOrElseUpdate(status.location, ArrayBuffer()) += + ((ShuffleBlockId(shuffleId, mapId, part), status.getSizeForBlock(part))) + } } } diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index 4b9d59975bdc2..e4df7af81a6d2 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -104,8 +104,8 @@ class HashPartitioner(partitions: Int) extends Partitioner { * the value of `partitions`. */ class RangePartitioner[K : Ordering : ClassTag, V]( - @transient partitions: Int, - @transient rdd: RDD[_ <: Product2[K, V]], + partitions: Int, + rdd: RDD[_ <: Product2[K, V]], private var ascending: Boolean = true) extends Partitioner { @@ -291,7 +291,7 @@ private[spark] object RangePartitioner { while ((i < numCandidates) && (j < partitions - 1)) { val (key, weight) = ordered(i) cumWeight += weight - if (cumWeight > target) { + if (cumWeight >= target) { // Skip duplicate values. if (previousBound.isEmpty || ordering.gt(key, previousBound.get)) { bounds += key diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 673ef49e7c1c5..746d2081d4393 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -310,7 +310,16 @@ private[spark] class SecurityManager(sparkConf: SparkConf) setViewAcls(Set[String](defaultUser), allowedUsers) } - def getViewAcls: String = viewAcls.mkString(",") + /** + * Checking the existence of "*" is necessary as YARN can't recognize the "*" in "defaultuser,*" + */ + def getViewAcls: String = { + if (viewAcls.contains("*")) { + "*" + } else { + viewAcls.mkString(",") + } + } /** * Admin acls should be set before the view or modify acls. If you modify the admin @@ -321,7 +330,16 @@ private[spark] class SecurityManager(sparkConf: SparkConf) logInfo("Changing modify acls to: " + modifyAcls.mkString(",")) } - def getModifyAcls: String = modifyAcls.mkString(",") + /** + * Checking the existence of "*" is necessary as YARN can't recognize the "*" in "defaultuser,*" + */ + def getModifyAcls: String = { + if (modifyAcls.contains("*")) { + "*" + } else { + modifyAcls.mkString(",") + } + } /** * Admin acls should be set before the view or modify acls. 
If you modify the admin @@ -394,7 +412,7 @@ private[spark] class SecurityManager(sparkConf: SparkConf) def checkUIViewPermissions(user: String): Boolean = { logDebug("user=" + user + " aclsEnabled=" + aclsEnabled() + " viewAcls=" + viewAcls.mkString(",")) - !aclsEnabled || user == null || viewAcls.contains(user) + !aclsEnabled || user == null || viewAcls.contains(user) || viewAcls.contains("*") } /** @@ -409,7 +427,7 @@ private[spark] class SecurityManager(sparkConf: SparkConf) def checkModifyPermissions(user: String): Boolean = { logDebug("user=" + user + " aclsEnabled=" + aclsEnabled() + " modifyAcls=" + modifyAcls.mkString(",")) - !aclsEnabled || user == null || modifyAcls.contains(user) + !aclsEnabled || user == null || modifyAcls.contains(user) || modifyAcls.contains("*") } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 738887076b0d1..0c72adfb9505b 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -33,6 +33,7 @@ import scala.collection.mutable.HashMap import scala.reflect.{ClassTag, classTag} import scala.util.control.NonFatal +import org.apache.commons.lang.SerializationUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, @@ -96,7 +97,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli val startTime = System.currentTimeMillis() - private val stopped: AtomicBoolean = new AtomicBoolean(false) + private[spark] val stopped: AtomicBoolean = new AtomicBoolean(false) private def assertNotStopped(): Unit = { if (stopped.get()) { @@ -265,6 +266,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli def isLocal: Boolean = (master == "local" || master.startsWith("local[")) + /** + * @return true if context is stopped or in the midst of stopping. + */ + def isStopped: Boolean = stopped.get() + // An asynchronous listener bus for Spark events private[spark] val listenerBus = new LiveListenerBus @@ -347,8 +353,12 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli private[spark] var checkpointDir: Option[String] = None // Thread Local variable that can be used by users to pass information down the stack - private val localProperties = new InheritableThreadLocal[Properties] { - override protected def childValue(parent: Properties): Properties = new Properties(parent) + protected[spark] val localProperties = new InheritableThreadLocal[Properties] { + override protected def childValue(parent: Properties): Properties = { + // Note: make a clone such that changes in the parent properties aren't reflected in + // the those of the children threads, which has confusing semantics (SPARK-10563). + SerializationUtils.clone(parent).asInstanceOf[Properties] + } override protected def initialValue(): Properties = new Properties() } @@ -516,6 +526,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli _applicationId = _taskScheduler.applicationId() _applicationAttemptId = taskScheduler.applicationAttemptId() _conf.set("spark.app.id", _applicationId) + _ui.foreach(_.setAppId(_applicationId)) _env.blockManager.initialize(_applicationId) // The metrics system for Driver need to be set spark.app.id to app ID. 
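A hedged sketch of the wildcard-ACL behaviour added in the SecurityManager hunks above: a "*" entry grants any user access, and the serialized form collapses to "*" because, as the new doc comments note, YARN cannot interpret a mixed list such as "defaultuser,*". The object below is illustrative only, not the SecurityManager API.

// Simplified model of the "*"-aware ACL handling.
object AclSketch {
  def hasAccess(user: String, acls: Set[String], aclsEnabled: Boolean): Boolean =
    !aclsEnabled || user == null || acls.contains(user) || acls.contains("*")

  // Collapse to "*" when present, mirroring the getViewAcls/getModifyAcls change.
  def render(acls: Set[String]): String =
    if (acls.contains("*")) "*" else acls.mkString(",")

  def main(args: Array[String]): Unit = {
    val acls = Set("defaultuser", "*")
    println(hasAccess("alice", acls, aclsEnabled = true)) // true, because of "*"
    println(render(acls))                                 // "*"
  }
}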
@@ -858,7 +869,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // Use setInputPaths so that wholeTextFiles aligns with hadoopFile/textFile in taking // comma separated files as input. (see SPARK-7155) NewFileInputFormat.setInputPaths(job, path) - val updateConf = job.getConfiguration + val updateConf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) new WholeTextFileRDD( this, classOf[WholeTextFileInputFormat], @@ -910,7 +921,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // Use setInputPaths so that binaryFiles aligns with hadoopFile/textFile in taking // comma separated files as input. (see SPARK-7155) NewFileInputFormat.setInputPaths(job, path) - val updateConf = job.getConfiguration + val updateConf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) new BinaryFileRDD( this, classOf[StreamInputFormat], @@ -1092,7 +1103,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // Use setInputPaths so that newAPIHadoopFile aligns with hadoopFile/textFile in taking // comma separated files as input. (see SPARK-7155) NewFileInputFormat.setInputPaths(job, path) - val updatedConf = job.getConfiguration + val updatedConf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf).setName(path) } @@ -1516,8 +1527,12 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli */ @DeveloperApi def getRDDStorageInfo: Array[RDDInfo] = { + getRDDStorageInfo(_ => true) + } + + private[spark] def getRDDStorageInfo(filter: RDD[_] => Boolean): Array[RDDInfo] = { assertNotStopped() - val rddInfos = persistentRdds.values.map(RDDInfo.fromRdd).toArray + val rddInfos = persistentRdds.values.filter(filter).map(RDDInfo.fromRdd).toArray StorageUtils.updateRddInfo(rddInfos, getExecutorStorageStatus) rddInfos.filter(_.isCached) } @@ -1741,6 +1756,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli } SparkEnv.set(null) } + // Unset YARN mode system env variable, to allow switching between cluster types. + System.clearProperty("SPARK_YARN_MODE") SparkContext.clearActiveContext() logInfo("Successfully stopped SparkContext") } @@ -1980,6 +1997,23 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli new SimpleFutureAction(waiter, resultFunc) } + /** + * Submit a map stage for execution. This is currently an internal API only, but might be + * promoted to DeveloperApi in the future. + */ + private[spark] def submitMapStage[K, V, C](dependency: ShuffleDependency[K, V, C]) + : SimpleFutureAction[MapOutputStatistics] = { + assertNotStopped() + val callSite = getCallSite() + var result: MapOutputStatistics = null + val waiter = dagScheduler.submitMapStage( + dependency, + (r: MapOutputStatistics) => { result = r }, + callSite, + localProperties.get) + new SimpleFutureAction[MapOutputStatistics](waiter, result) + } + /** * Cancel active jobs for the specified group. See [[org.apache.spark.SparkContext.setJobGroup]] * for more information. 
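The localProperties change in the SparkContext hunk above (SPARK-10563) clones the parent's Properties when a child thread is created, so later mutations in the parent are not silently visible to children. Below is a dependency-free sketch of the same idea; a plain copy stands in for the SerializationUtils.clone call used in the real change, and the names are illustrative.

import java.util.Properties

object LocalPropsSketch {
  val localProps = new InheritableThreadLocal[Properties] {
    override def childValue(parent: Properties): Properties = {
      val snapshot = new Properties()
      snapshot.putAll(parent) // later changes to the parent are not seen by the child
      snapshot
    }
    override def initialValue(): Properties = new Properties()
  }

  def main(args: Array[String]): Unit = {
    localProps.get.setProperty("job.group", "g1")
    val child = new Thread(new Runnable {
      def run(): Unit = println(localProps.get.getProperty("job.group")) // prints g1
    })
    child.start()
    localProps.get.setProperty("job.group", "g2") // invisible to the already-created child
    child.join()
  }
}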
diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 0f1e2e069568d..c6fef7f91f00c 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -33,7 +33,6 @@ import org.apache.spark.broadcast.BroadcastManager import org.apache.spark.metrics.MetricsSystem import org.apache.spark.network.BlockTransferService import org.apache.spark.network.netty.NettyBlockTransferService -import org.apache.spark.network.nio.NioBlockTransferService import org.apache.spark.rpc.{RpcEndpointRef, RpcEndpoint, RpcEnv} import org.apache.spark.rpc.akka.AkkaRpcEnv import org.apache.spark.scheduler.{OutputCommitCoordinator, LiveListenerBus} @@ -326,15 +325,7 @@ object SparkEnv extends Logging { val shuffleMemoryManager = ShuffleMemoryManager.create(conf, numUsableCores) - val blockTransferService = - conf.get("spark.shuffle.blockTransferService", "netty").toLowerCase match { - case "netty" => - new NettyBlockTransferService(conf, securityManager, numUsableCores) - case "nio" => - logWarning("NIO-based block transfer service is deprecated, " + - "and will be removed in Spark 1.6.0.") - new NioBlockTransferService(conf, securityManager) - } + val blockTransferService = new NettyBlockTransferService(conf, securityManager, numUsableCores) val blockManagerMaster = new BlockManagerMaster(registerOrLookupEndpoint( BlockManagerMaster.DRIVER_ENDPOINT_NAME, diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala index f5dd36cbcfe6d..ac6eaab20d8d2 100644 --- a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala @@ -37,7 +37,7 @@ import org.apache.spark.util.SerializableJobConf * a filename to write to, etc, exactly like in a Hadoop MapReduce job. */ private[spark] -class SparkHadoopWriter(@transient jobConf: JobConf) +class SparkHadoopWriter(jobConf: JobConf) extends Logging with SparkHadoopMapRedUtil with Serializable { @@ -104,8 +104,7 @@ class SparkHadoopWriter(@transient jobConf: JobConf) } def commit() { - SparkHadoopMapRedUtil.commitTask( - getOutputCommitter(), getTaskContext(), jobID, splitID, attemptID) + SparkHadoopMapRedUtil.commitTask(getOutputCommitter(), getTaskContext(), jobID, splitID) } def commitJob() { diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 934d00dc708b9..9335c5f4160bf 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -17,13 +17,17 @@ package org.apache.spark -import java.io.{IOException, ObjectInputStream, ObjectOutputStream} +import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.Utils +// ============================================================================================== +// NOTE: new task end reasons MUST be accompanied with serialization logic in util.JsonProtocol! +// ============================================================================================== + /** * :: DeveloperApi :: * Various possible reasons why a task ended. 
The low-level TaskScheduler is supposed to retry @@ -48,6 +52,8 @@ case object Success extends TaskEndReason sealed trait TaskFailedReason extends TaskEndReason { /** Error message displayed in the web UI. */ def toErrorString: String + + def shouldEventuallyFailJob: Boolean = true } /** @@ -191,9 +197,18 @@ case object TaskKilled extends TaskFailedReason { * Task requested the driver to commit, but was denied. */ @DeveloperApi -case class TaskCommitDenied(jobID: Int, partitionID: Int, attemptID: Int) extends TaskFailedReason { +case class TaskCommitDenied( + jobID: Int, + partitionID: Int, + attemptNumber: Int) extends TaskFailedReason { override def toErrorString: String = s"TaskCommitDenied (Driver denied task commit)" + - s" for job: $jobID, partition: $partitionID, attempt: $attemptID" + s" for job: $jobID, partition: $partitionID, attemptNumber: $attemptNumber" + /** + * If a task failed because its attempt to commit was denied, do not count this failure + * towards failing the stage. This is intended to prevent spurious stage failures in cases + * where many speculative tasks are launched and denied to commit. + */ + override def shouldEventuallyFailJob: Boolean = false } /** @@ -202,8 +217,14 @@ case class TaskCommitDenied(jobID: Int, partitionID: Int, attemptID: Int) extend * the task crashed the JVM. */ @DeveloperApi -case class ExecutorLostFailure(execId: String) extends TaskFailedReason { - override def toErrorString: String = s"ExecutorLostFailure (executor ${execId} lost)" +case class ExecutorLostFailure(execId: String, isNormalExit: Boolean = false) + extends TaskFailedReason { + override def toErrorString: String = { + val exitBehavior = if (isNormalExit) "normally" else "abnormally" + s"ExecutorLostFailure (executor ${execId} exited ${exitBehavior})" + } + + override def shouldEventuallyFailJob: Boolean = !isNormalExit } /** diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index fb787979c1820..8344f6368ac48 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -239,7 +239,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) mapSideCombine: Boolean, serializer: Serializer): JavaPairRDD[K, C] = { implicit val ctag: ClassTag[C] = fakeClassTag - fromRDD(rdd.combineByKey( + fromRDD(rdd.combineByKeyWithClassTag( createCombiner, mergeValue, mergeCombiners, diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index b4d152b336602..8464b578ed09e 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -24,6 +24,7 @@ import java.util.{Collections, ArrayList => JArrayList, List => JList, Map => JM import scala.collection.JavaConverters._ import scala.collection.mutable import scala.language.existentials +import scala.util.control.NonFatal import com.google.common.base.Charsets.UTF_8 import org.apache.hadoop.conf.Configuration @@ -38,10 +39,9 @@ import org.apache.spark.input.PortableDataStream import org.apache.spark.rdd.RDD import org.apache.spark.util.{SerializableConfiguration, Utils} -import scala.util.control.NonFatal private[spark] class PythonRDD( - @transient parent: RDD[_], + parent: RDD[_], command: Array[Byte], envVars: JMap[String, String], pythonIncludes: JList[String], @@ -61,11 +61,39 @@ private[spark] 
class PythonRDD( if (preservePartitoning) firstParent.partitioner else None } + val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) + override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { + val runner = new PythonRunner( + command, envVars, pythonIncludes, pythonExec, pythonVer, broadcastVars, accumulator, + bufferSize, reuse_worker) + runner.compute(firstParent.iterator(split, context), split.index, context) + } +} + + +/** + * A helper class to run Python UDFs in Spark. + */ +private[spark] class PythonRunner( + command: Array[Byte], + envVars: JMap[String, String], + pythonIncludes: JList[String], + pythonExec: String, + pythonVer: String, + broadcastVars: JList[Broadcast[PythonBroadcast]], + accumulator: Accumulator[JList[Array[Byte]]], + bufferSize: Int, + reuse_worker: Boolean) + extends Logging { + + def compute( + inputIterator: Iterator[_], + partitionIndex: Int, + context: TaskContext): Iterator[Array[Byte]] = { val startTime = System.currentTimeMillis val env = SparkEnv.get - val localdir = env.blockManager.diskBlockManager.localDirs.map( - f => f.getPath()).mkString(",") + val localdir = env.blockManager.diskBlockManager.localDirs.map(f => f.getPath()).mkString(",") envVars.put("SPARK_LOCAL_DIRS", localdir) // it's also used in monitor thread if (reuse_worker) { envVars.put("SPARK_REUSE_WORKER", "1") @@ -75,7 +103,7 @@ private[spark] class PythonRDD( @volatile var released = false // Start a thread to feed the process input from our parent's iterator - val writerThread = new WriterThread(env, worker, split, context) + val writerThread = new WriterThread(env, worker, inputIterator, partitionIndex, context) context.addTaskCompletionListener { context => writerThread.shutdownOnTaskCompletion() @@ -183,13 +211,16 @@ private[spark] class PythonRDD( new InterruptibleIterator(context, stdoutIterator) } - val asJavaRDD : JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) - /** * The thread responsible for writing the data from the PythonRDD's parent iterator to the * Python process. 
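The hunk above splits the per-partition work out of PythonRDD into a PythonRunner that only consumes an input iterator and a partition index. A toy sketch of that shape follows (illustrative names, not the real API); it shows why an RDD's compute() can simply delegate to such a runner, and why the same runner can also be driven from non-RDD call sites such as UDF evaluation.

// A runner that sees only an iterator and a partition index, never the RDD itself.
trait PartitionRunner[I, O] {
  def compute(input: Iterator[I], partitionIndex: Int): Iterator[O]
}

class UppercaseRunner extends PartitionRunner[String, String] {
  def compute(input: Iterator[String], partitionIndex: Int): Iterator[String] =
    input.map(s => s"[$partitionIndex] ${s.toUpperCase}")
}

object RunnerSketch {
  def main(args: Array[String]): Unit = {
    val runner = new UppercaseRunner
    // An RDD's compute(split, context) would delegate here with firstParent's iterator.
    runner.compute(Iterator("a", "b"), partitionIndex = 0).foreach(println)
  }
}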
*/ - class WriterThread(env: SparkEnv, worker: Socket, split: Partition, context: TaskContext) + class WriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[_], + partitionIndex: Int, + context: TaskContext) extends Thread(s"stdout writer for $pythonExec") { @volatile private var _exception: Exception = null @@ -211,11 +242,11 @@ private[spark] class PythonRDD( val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize) val dataOut = new DataOutputStream(stream) // Partition index - dataOut.writeInt(split.index) + dataOut.writeInt(partitionIndex) // Python version of driver PythonRDD.writeUTF(pythonVer, dataOut) // sparkFilesDir - PythonRDD.writeUTF(SparkFiles.getRootDirectory, dataOut) + PythonRDD.writeUTF(SparkFiles.getRootDirectory(), dataOut) // Python includes (*.zip and *.egg files) dataOut.writeInt(pythonIncludes.size()) for (include <- pythonIncludes.asScala) { @@ -246,7 +277,7 @@ private[spark] class PythonRDD( dataOut.writeInt(command.length) dataOut.write(command) // Data values - PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut) + PythonRDD.writeIteratorToStream(inputIterator, dataOut) dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION) dataOut.writeInt(SpecialLengths.END_OF_STREAM) dataOut.flush() @@ -327,7 +358,8 @@ private[spark] object PythonRDD extends Logging { // remember the broadcasts sent to each worker private val workerBroadcasts = new mutable.WeakHashMap[Socket, mutable.Set[Long]]() - private def getWorkerBroadcasts(worker: Socket) = { + + def getWorkerBroadcasts(worker: Socket): mutable.Set[Long] = { synchronized { workerBroadcasts.getOrElseUpdate(worker, new mutable.HashSet[Long]()) } @@ -601,7 +633,7 @@ private[spark] object PythonRDD extends Logging { * * The thread will terminate after all the data are sent or any exceptions happen. */ - private def serveIterator[T](items: Iterator[T], threadName: String): Int = { + def serveIterator[T](items: Iterator[T], threadName: String): Int = { val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost")) // Close the socket if no connection in 3 seconds serverSocket.setSoTimeout(3000) @@ -785,7 +817,7 @@ class BytesToString extends org.apache.spark.api.java.function.Function[Array[By * Internal class that acts as an `AccumulatorParam` for Python accumulators. Inside, it * collects a list of pickled strings that we pass to Python through a socket. */ -private class PythonAccumulatorParam(@transient serverHost: String, serverPort: Int) +private class PythonAccumulatorParam(@transient private val serverHost: String, serverPort: Int) extends AccumulatorParam[JList[Array[Byte]]] { Utils.checkHost(serverHost, "Expected hostname") @@ -839,7 +871,8 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort: * write the data into disk after deserialization, then Python can read it from disks. 
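serveIterator, made non-private above, follows a small pattern worth spelling out: bind an ephemeral local port, return that port to the caller, and stream the items from a daemon thread, giving up if no client connects within the timeout. Below is a simplified sketch restricted to strings, with no Spark serialization; the names are illustrative.

import java.io.{BufferedOutputStream, DataOutputStream}
import java.net.{InetAddress, ServerSocket}

object ServeIteratorSketch {
  def serve(items: Iterator[String], threadName: String): Int = {
    val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost"))
    serverSocket.setSoTimeout(3000) // give up if no connection arrives within 3 seconds
    new Thread(threadName) {
      setDaemon(true)
      override def run(): Unit = {
        try {
          val sock = serverSocket.accept()
          val out = new DataOutputStream(new BufferedOutputStream(sock.getOutputStream))
          try items.foreach(s => out.writeUTF(s)) finally out.close()
        } finally serverSocket.close()
      }
    }.start()
    serverSocket.getLocalPort
  }
}

The consumer then connects to localhost on the returned port and reads until the stream is closed.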
*/ // scalastyle:off no.finalize -private[spark] class PythonBroadcast(@transient var path: String) extends Serializable { +private[spark] class PythonBroadcast(@transient var path: String) extends Serializable + with Logging { /** * Read data from disks, then copy it to `out` @@ -875,7 +908,9 @@ private[spark] class PythonBroadcast(@transient var path: String) extends Serial if (!path.isEmpty) { val file = new File(path) if (file.exists()) { - file.delete() + if (!file.delete()) { + logWarning(s"Error deleting ${file.getPath}") + } } } } diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala index bb82f3285f1d9..2a792d81994fd 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala @@ -125,10 +125,11 @@ private[r] class RBackendHandler(server: RBackend) val methods = cls.getMethods val selectedMethods = methods.filter(m => m.getName == methodName) if (selectedMethods.length > 0) { - val methods = selectedMethods.filter { x => - matchMethod(numArgs, args, x.getParameterTypes) - } - if (methods.isEmpty) { + val index = findMatchedSignature( + selectedMethods.map(_.getParameterTypes), + args) + + if (index.isEmpty) { logWarning(s"cannot find matching method ${cls}.$methodName. " + s"Candidates are:") selectedMethods.foreach { method => @@ -136,18 +137,29 @@ private[r] class RBackendHandler(server: RBackend) } throw new Exception(s"No matched method found for $cls.$methodName") } - val ret = methods.head.invoke(obj, args : _*) + + val ret = selectedMethods(index.get).invoke(obj, args : _*) // Write status bit writeInt(dos, 0) writeObject(dos, ret.asInstanceOf[AnyRef]) } else if (methodName == "") { // methodName should be "" for constructor - val ctor = cls.getConstructors.filter { x => - matchMethod(numArgs, args, x.getParameterTypes) - }.head + val ctors = cls.getConstructors + val index = findMatchedSignature( + ctors.map(_.getParameterTypes), + args) - val obj = ctor.newInstance(args : _*) + if (index.isEmpty) { + logWarning(s"cannot find matching constructor for ${cls}. " + + s"Candidates are:") + ctors.foreach { ctor => + logWarning(s"$cls(${ctor.getParameterTypes.mkString(",")})") + } + throw new Exception(s"No matched constructor found for $cls") + } + + val obj = ctors(index.get).newInstance(args : _*) writeInt(dos, 0) writeObject(dos, obj.asInstanceOf[AnyRef]) @@ -166,40 +178,79 @@ private[r] class RBackendHandler(server: RBackend) // Read a number of arguments from the data input stream def readArgs(numArgs: Int, dis: DataInputStream): Array[java.lang.Object] = { - (0 until numArgs).map { arg => + (0 until numArgs).map { _ => readObject(dis) }.toArray } - // Checks if the arguments passed in args matches the parameter types. - // NOTE: Currently we do exact match. We may add type conversions later. - def matchMethod( - numArgs: Int, - args: Array[java.lang.Object], - parameterTypes: Array[Class[_]]): Boolean = { - if (parameterTypes.length != numArgs) { - return false - } + // Find a matching method signature in an array of signatures of constructors + // or methods of the same name according to the passed arguments. Arguments + // may be converted in order to match a signature. + // + // Note that in Java reflection, constructors and normal methods are of different + // classes, and share no parent class that provides methods for reflection uses. 
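The comment above describes matching call arguments against an array of candidate parameter-type signatures, treating primitive parameter types as their boxed wrappers because the arguments arrive as java.lang.Object. A stripped-down sketch of that matching loop (no Seq/array conversion, illustrative names, and a plain primitive-to-wrapper mapping rather than the exact table used in the diff):

object SignatureMatchSketch {
  private def boxed(t: Class[_]): Class[_] = t match {
    case java.lang.Integer.TYPE => classOf[java.lang.Integer]
    case java.lang.Long.TYPE    => classOf[java.lang.Long]
    case java.lang.Double.TYPE  => classOf[java.lang.Double]
    case java.lang.Boolean.TYPE => classOf[java.lang.Boolean]
    case other                  => other
  }

  // Returns the index of the first signature whose parameter types accept the arguments.
  def findMatch(signatures: Array[Array[Class[_]]], args: Array[Object]): Option[Int] =
    signatures.indexWhere { types =>
      types.length == args.length &&
        types.zip(args).forall { case (t, a) => boxed(t).isInstance(a) }
    } match {
      case -1 => None
      case i  => Some(i)
    }

  def main(args: Array[String]): Unit = {
    val sigs: Array[Array[Class[_]]] =
      Array(Array(classOf[String], java.lang.Integer.TYPE), Array(classOf[String]))
    val callArgs: Array[Object] = Array("x", Int.box(3))
    println(findMatch(sigs, callArgs)) // Some(0)
  }
}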
+ // There is no unified way to handle them in this function. So an array of signatures + // is passed in instead of an array of candidate constructors or methods. + // + // Returns an Option[Int] which is the index of the matched signature in the array. + def findMatchedSignature( + parameterTypesOfMethods: Array[Array[Class[_]]], + args: Array[Object]): Option[Int] = { + val numArgs = args.length + + for (index <- 0 until parameterTypesOfMethods.length) { + val parameterTypes = parameterTypesOfMethods(index) + + if (parameterTypes.length == numArgs) { + var argMatched = true + var i = 0 + while (i < numArgs && argMatched) { + val parameterType = parameterTypes(i) + + if (parameterType == classOf[Seq[Any]] && args(i).getClass.isArray) { + // The case that the parameter type is a Scala Seq and the argument + // is a Java array is considered matching. The array will be converted + // to a Seq later if this method is matched. + } else { + var parameterWrapperType = parameterType + + // Convert native parameters to Object types as args is Array[Object] here + if (parameterType.isPrimitive) { + parameterWrapperType = parameterType match { + case java.lang.Integer.TYPE => classOf[java.lang.Integer] + case java.lang.Long.TYPE => classOf[java.lang.Integer] + case java.lang.Double.TYPE => classOf[java.lang.Double] + case java.lang.Boolean.TYPE => classOf[java.lang.Boolean] + case _ => parameterType + } + } + if (!parameterWrapperType.isInstance(args(i))) { + argMatched = false + } + } - for (i <- 0 to numArgs - 1) { - val parameterType = parameterTypes(i) - var parameterWrapperType = parameterType - - // Convert native parameters to Object types as args is Array[Object] here - if (parameterType.isPrimitive) { - parameterWrapperType = parameterType match { - case java.lang.Integer.TYPE => classOf[java.lang.Integer] - case java.lang.Long.TYPE => classOf[java.lang.Integer] - case java.lang.Double.TYPE => classOf[java.lang.Double] - case java.lang.Boolean.TYPE => classOf[java.lang.Boolean] - case _ => parameterType + i = i + 1 + } + + if (argMatched) { + // For now, we return the first matching method. + // TODO: find best method in matching methods. 
+ + // Convert args if needed + val parameterTypes = parameterTypesOfMethods(index) + + (0 until numArgs).map { i => + if (parameterTypes(i) == classOf[Seq[Any]] && args(i).getClass.isArray) { + // Convert a Java array to scala Seq + args(i) = args(i).asInstanceOf[Array[_]].toSeq + } + } + + return Some(index) } - } - if (!parameterWrapperType.isInstance(args(i))) { - return false } } - true + None } } diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala index 9e807cc52f18c..fd5646b5b6372 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala @@ -44,7 +44,7 @@ private[spark] object RUtils { (sys.props("spark.master"), sys.props("spark.submit.deployMode")) } else { val sparkConf = SparkEnv.get.conf - (sparkConf.get("spark.master"), sparkConf.get("spark.submit.deployMode")) + (sparkConf.get("spark.master"), sparkConf.get("spark.submit.deployMode", "client")) } val isYarnCluster = master != null && master.contains("yarn") && deployMode == "cluster" diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index 26ad4f1d4697e..0c78613e406e1 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -21,6 +21,7 @@ import java.io.{DataInputStream, DataOutputStream} import java.sql.{Timestamp, Date, Time} import scala.collection.JavaConverters._ +import scala.collection.mutable.WrappedArray /** * Utility functions to serialize, deserialize objects to / from R @@ -208,94 +209,133 @@ private[spark] object SerDe { case "array" => dos.writeByte('a') // Array of objects case "list" => dos.writeByte('l') + case "map" => dos.writeByte('e') case "jobj" => dos.writeByte('j') case _ => throw new IllegalArgumentException(s"Invalid type $typeStr") } } - def writeObject(dos: DataOutputStream, value: Object): Unit = { - if (value == null) { + private def writeKeyValue(dos: DataOutputStream, key: Object, value: Object): Unit = { + if (key == null) { + throw new IllegalArgumentException("Key in map can't be null.") + } else if (!key.isInstanceOf[String]) { + throw new IllegalArgumentException(s"Invalid map key type: ${key.getClass.getName}") + } + + writeString(dos, key.asInstanceOf[String]) + writeObject(dos, value) + } + + def writeObject(dos: DataOutputStream, obj: Object): Unit = { + if (obj == null) { writeType(dos, "void") } else { - value.getClass.getName match { - case "java.lang.Character" => + // Convert ArrayType collected from DataFrame to Java array + // Collected data of ArrayType from a DataFrame is observed to be of + // type "scala.collection.mutable.WrappedArray" + val value = + if (obj.isInstanceOf[WrappedArray[_]]) { + obj.asInstanceOf[WrappedArray[_]].toArray + } else { + obj + } + + value match { + case v: java.lang.Character => writeType(dos, "character") - writeString(dos, value.asInstanceOf[Character].toString) - case "java.lang.String" => + writeString(dos, v.toString) + case v: java.lang.String => writeType(dos, "character") - writeString(dos, value.asInstanceOf[String]) - case "java.lang.Long" => + writeString(dos, v) + case v: java.lang.Long => writeType(dos, "double") - writeDouble(dos, value.asInstanceOf[Long].toDouble) - case "java.lang.Float" => + writeDouble(dos, v.toDouble) + case v: java.lang.Float => writeType(dos, "double") - writeDouble(dos, value.asInstanceOf[Float].toDouble) - case 
"java.math.BigDecimal" => + writeDouble(dos, v.toDouble) + case v: java.math.BigDecimal => writeType(dos, "double") - val javaDecimal = value.asInstanceOf[java.math.BigDecimal] - writeDouble(dos, scala.math.BigDecimal(javaDecimal).toDouble) - case "java.lang.Double" => + writeDouble(dos, scala.math.BigDecimal(v).toDouble) + case v: java.lang.Double => writeType(dos, "double") - writeDouble(dos, value.asInstanceOf[Double]) - case "java.lang.Byte" => + writeDouble(dos, v) + case v: java.lang.Byte => writeType(dos, "integer") - writeInt(dos, value.asInstanceOf[Byte].toInt) - case "java.lang.Short" => + writeInt(dos, v.toInt) + case v: java.lang.Short => writeType(dos, "integer") - writeInt(dos, value.asInstanceOf[Short].toInt) - case "java.lang.Integer" => + writeInt(dos, v.toInt) + case v: java.lang.Integer => writeType(dos, "integer") - writeInt(dos, value.asInstanceOf[Int]) - case "java.lang.Boolean" => + writeInt(dos, v) + case v: java.lang.Boolean => writeType(dos, "logical") - writeBoolean(dos, value.asInstanceOf[Boolean]) - case "java.sql.Date" => + writeBoolean(dos, v) + case v: java.sql.Date => writeType(dos, "date") - writeDate(dos, value.asInstanceOf[Date]) - case "java.sql.Time" => + writeDate(dos, v) + case v: java.sql.Time => writeType(dos, "time") - writeTime(dos, value.asInstanceOf[Time]) - case "java.sql.Timestamp" => + writeTime(dos, v) + case v: java.sql.Timestamp => writeType(dos, "time") - writeTime(dos, value.asInstanceOf[Timestamp]) + writeTime(dos, v) // Handle arrays // Array of primitive types // Special handling for byte array - case "[B" => + case v: Array[Byte] => writeType(dos, "raw") - writeBytes(dos, value.asInstanceOf[Array[Byte]]) + writeBytes(dos, v) - case "[C" => + case v: Array[Char] => writeType(dos, "array") - writeStringArr(dos, value.asInstanceOf[Array[Char]].map(_.toString)) - case "[S" => + writeStringArr(dos, v.map(_.toString)) + case v: Array[Short] => writeType(dos, "array") - writeIntArr(dos, value.asInstanceOf[Array[Short]].map(_.toInt)) - case "[I" => + writeIntArr(dos, v.map(_.toInt)) + case v: Array[Int] => writeType(dos, "array") - writeIntArr(dos, value.asInstanceOf[Array[Int]]) - case "[J" => + writeIntArr(dos, v) + case v: Array[Long] => writeType(dos, "array") - writeDoubleArr(dos, value.asInstanceOf[Array[Long]].map(_.toDouble)) - case "[F" => + writeDoubleArr(dos, v.map(_.toDouble)) + case v: Array[Float] => writeType(dos, "array") - writeDoubleArr(dos, value.asInstanceOf[Array[Float]].map(_.toDouble)) - case "[D" => + writeDoubleArr(dos, v.map(_.toDouble)) + case v: Array[Double] => writeType(dos, "array") - writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) - case "[Z" => + writeDoubleArr(dos, v) + case v: Array[Boolean] => writeType(dos, "array") - writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) + writeBooleanArr(dos, v) // Array of objects, null objects use "void" type - case c if c.startsWith("[") => + case v: Array[Object] => writeType(dos, "list") - val array = value.asInstanceOf[Array[Object]] - writeInt(dos, array.length) - array.foreach(elem => writeObject(dos, elem)) + writeInt(dos, v.length) + v.foreach(elem => writeObject(dos, elem)) + + // Handle map + case v: java.util.Map[_, _] => + writeType(dos, "map") + writeInt(dos, v.size) + val iter = v.entrySet.iterator + while(iter.hasNext) { + val entry = iter.next + val key = entry.getKey + val value = entry.getValue + + writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) + } + case v: scala.collection.Map[_, _] => + writeType(dos, "map") + 
writeInt(dos, v.size) + v.foreach { case (key, value) => + writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) + } case _ => writeType(dos, "jobj") @@ -329,12 +369,11 @@ private[spark] object SerDe { out.writeDouble((value.getTime / 1000).toDouble + value.getNanos.toDouble / 1e9) } - // NOTE: Only works for ASCII right now def writeString(out: DataOutputStream, value: String): Unit = { - val len = value.length - out.writeInt(len + 1) // For the \0 - out.writeBytes(value) - out.writeByte(0) + val utf8 = value.getBytes("UTF-8") + val len = utf8.length + out.writeInt(len) + out.write(utf8, 0, len) } def writeBytes(out: DataOutputStream, value: Array[Byte]): Unit = { diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala index 4b28866dcaa7c..7d160b6790eaa 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala @@ -175,8 +175,10 @@ private[deploy] object RPackageUtils extends Logging { print(s"ERROR: Failed to build R package in $file.", printStream) print(RJarDoc, printStream) } - } finally { - rSource.delete() // clean up + } finally { // clean up + if (!rSource.delete()) { + logWarning(s"Error deleting ${rSource.getPath()}") + } } } else { if (verbose) { @@ -211,7 +213,9 @@ private[deploy] object RPackageUtils extends Logging { val filesToBundle = listFilesRecursively(dir, Seq(".zip")) // create a zip file from scratch, do not append to existing file. val zipFile = new File(dir, name) - zipFile.delete() + if (!zipFile.delete()) { + logWarning(s"Error deleting ${zipFile.getPath()}") + } val zipOutputStream = new ZipOutputStream(new FileOutputStream(zipFile, false)) try { filesToBundle.foreach { file => diff --git a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala index 05b954ce36998..58cc1f9d963df 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala @@ -25,6 +25,7 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.spark.api.r.{RBackend, RUtils} +import org.apache.spark.{SparkException, SparkUserAppException} import org.apache.spark.util.RedirectThread /** @@ -84,12 +85,15 @@ object RRunner { } finally { sparkRBackend.close() } - System.exit(returnCode) + if (returnCode != 0) { + throw new SparkUserAppException(returnCode) + } } else { + val errorMessage = s"SparkR backend did not initialize in $backendTimeout seconds" // scalastyle:off println - System.err.println("SparkR backend did not initialize in " + backendTimeout + " seconds") + System.err.println(errorMessage) // scalastyle:on println - System.exit(-1) + throw new SparkException(errorMessage) } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index f7723ef5bde4c..d606b80c03c98 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -192,7 +192,9 @@ class SparkHadoopUtil extends Logging { * while it's interface in Hadoop 2.+. 
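The writeString change above length-prefixes UTF-8 bytes instead of writing ASCII with a trailing NUL, so non-ASCII strings survive the trip to R. Below is a round-trip sketch of that framing; the reader shown here is only for illustration, since the real counterpart lives on the R side.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
import java.nio.charset.StandardCharsets

object StringSerDeSketch {
  def writeString(out: DataOutputStream, value: String): Unit = {
    val utf8 = value.getBytes(StandardCharsets.UTF_8)
    out.writeInt(utf8.length) // byte count, not character count, and no trailing NUL
    out.write(utf8, 0, utf8.length)
  }

  def readString(in: DataInputStream): String = {
    val len = in.readInt()
    val buf = new Array[Byte](len)
    in.readFully(buf)
    new String(buf, StandardCharsets.UTF_8)
  }

  def main(args: Array[String]): Unit = {
    val bytes = new ByteArrayOutputStream()
    writeString(new DataOutputStream(bytes), "héllo wörld")
    println(readString(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray))))
  }
}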
*/ def getConfigurationFromJobContext(context: JobContext): Configuration = { + // scalastyle:off jobconfig val method = context.getClass.getMethod("getConfiguration") + // scalastyle:on jobconfig method.invoke(context).asInstanceOf[Configuration] } @@ -204,7 +206,9 @@ class SparkHadoopUtil extends Logging { */ def getTaskAttemptIDFromTaskAttemptContext( context: MapReduceTaskAttemptContext): MapReduceTaskAttemptID = { + // scalastyle:off jobconfig val method = context.getClass.getMethod("getTaskAttemptID") + // scalastyle:on jobconfig method.invoke(context).asInstanceOf[MapReduceTaskAttemptID] } @@ -381,20 +385,13 @@ class SparkHadoopUtil extends Logging { object SparkHadoopUtil { - private val hadoop = { - val yarnMode = java.lang.Boolean.valueOf( - System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE"))) - if (yarnMode) { - try { - Utils.classForName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil") - .newInstance() - .asInstanceOf[SparkHadoopUtil] - } catch { - case e: Exception => throw new SparkException("Unable to load YARN support", e) - } - } else { - new SparkHadoopUtil - } + private lazy val hadoop = new SparkHadoopUtil + private lazy val yarn = try { + Utils.classForName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil") + .newInstance() + .asInstanceOf[SparkHadoopUtil] + } catch { + case e: Exception => throw new SparkException("Unable to load YARN support", e) } val SPARK_YARN_CREDS_TEMP_EXTENSION = ".tmp" @@ -402,6 +399,13 @@ object SparkHadoopUtil { val SPARK_YARN_CREDS_COUNTER_DELIM = "-" def get: SparkHadoopUtil = { - hadoop + // Check each time to support changing to/from YARN + val yarnMode = java.lang.Boolean.valueOf( + System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE"))) + if (yarnMode) { + yarn + } else { + hadoop + } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 86fcf942c2c4e..ad92f5635af35 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -319,8 +319,8 @@ object SparkSubmit { // The following modes are not supported or applicable (clusterManager, deployMode) match { - case (MESOS, CLUSTER) if args.isPython => - printErrorAndExit("Cluster deploy mode is currently not supported for python " + + case (MESOS, CLUSTER) if args.isR => + printErrorAndExit("Cluster deploy mode is currently not supported for R " + "applications on Mesos clusters.") case (STANDALONE, CLUSTER) if args.isPython => printErrorAndExit("Cluster deploy mode is currently not supported for python " + @@ -551,7 +551,15 @@ object SparkSubmit { if (isMesosCluster) { assert(args.useRest, "Mesos cluster mode is only supported through the REST submission API") childMainClass = "org.apache.spark.deploy.rest.RestSubmissionClient" - childArgs += (args.primaryResource, args.mainClass) + if (args.isPython) { + // Second argument is main class + childArgs += (args.primaryResource, "") + if (args.pyFiles != null) { + sysProps("spark.submit.pyFiles") = args.pyFiles + } + } else { + childArgs += (args.primaryResource, args.mainClass) + } if (args.childArgs != null) { childArgs ++= args.childArgs } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index e573ff16c50a3..5eb8adf97d90b 100644 --- 
a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -18,6 +18,7 @@ package org.apache.spark.deploy.history import java.io.{BufferedInputStream, FileNotFoundException, InputStream, IOException, OutputStream} +import java.util.UUID import java.util.concurrent.{ExecutorService, Executors, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} @@ -73,7 +74,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // The modification time of the newest log detected during the last scan. This is used // to ignore logs that are older during subsequent scans, to avoid processing data that // is already known. - private var lastModifiedTime = -1L + private var lastScanTime = -1L // Mapping of application IDs to their metadata, in descending end time order. Apps are inserted // into the map in order, so the LinkedHashMap maintains the correct ordering. @@ -145,16 +146,15 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val ui = { val conf = this.conf.clone() val appSecManager = new SecurityManager(conf) - SparkUI.createHistoryUI(conf, replayBus, appSecManager, appId, + SparkUI.createHistoryUI(conf, replayBus, appSecManager, appInfo.name, HistoryServer.getAttemptURI(appId, attempt.attemptId), attempt.startTime) // Do not call ui.bind() to avoid creating a new server for each application } val appListener = new ApplicationEventListener() replayBus.addListener(appListener) - val appInfo = replay(fs.getFileStatus(new Path(logDir, attempt.logPath)), replayBus) - appInfo.map { info => - ui.setAppName(s"${info.name} ($appId)") - + val appAttemptInfo = replay(fs.getFileStatus(new Path(logDir, attempt.logPath)), + replayBus) + appAttemptInfo.map { info => val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false) ui.getSecurityManager.setAcls(uiAclsEnabled) // make sure to set admin acls before view acls so they are properly picked up @@ -179,15 +179,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) */ private[history] def checkForLogs(): Unit = { try { + val newLastScanTime = getNewLastScanTime() val statusList = Option(fs.listStatus(new Path(logDir))).map(_.toSeq) .getOrElse(Seq[FileStatus]()) - var newLastModifiedTime = lastModifiedTime val logInfos: Seq[FileStatus] = statusList .filter { entry => try { getModificationTime(entry).map { time => - newLastModifiedTime = math.max(newLastModifiedTime, time) - time >= lastModifiedTime + time >= lastScanTime }.getOrElse(false) } catch { case e: AccessControlException => @@ -224,12 +223,31 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } - lastModifiedTime = newLastModifiedTime + lastScanTime = newLastScanTime } catch { case e: Exception => logError("Exception in checking for event log updates", e) } } + private def getNewLastScanTime(): Long = { + val fileName = "." 
+ UUID.randomUUID().toString + val path = new Path(logDir, fileName) + val fos = fs.create(path) + + try { + fos.close() + fs.getFileStatus(path).getModificationTime + } catch { + case e: Exception => + logError("Exception encountered when attempting to update last scan time", e) + lastScanTime + } finally { + if (!fs.delete(path)) { + logWarning(s"Error deleting ${path}") + } + } + } + override def writeEventLogs( appId: String, attemptId: Option[String], @@ -389,7 +407,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) try { val path = new Path(logDir, attempt.logPath) if (fs.exists(path)) { - fs.delete(path, true) + if (!fs.delete(path, true)) { + logWarning(s"Error deleting ${path}") + } } } catch { case e: AccessControlException => diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 0830cc1ba1245..b347cb3be69f7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -51,7 +51,10 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") val hasMultipleAttempts = appsToShow.exists(_.attempts.size > 1) val appTable = if (hasMultipleAttempts) { - UIUtils.listingTable(appWithAttemptHeader, appWithAttemptRow, appsToShow) + // Sorting is disable here as table sort on rowspan has issues. + // ref. SPARK-10172 + UIUtils.listingTable(appWithAttemptHeader, appWithAttemptRow, + appsToShow, sortable = false) } else { UIUtils.listingTable(appHeader, appRow, appsToShow) } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala index aa379d4cd61e7..1aa8cd5013b49 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala @@ -45,7 +45,10 @@ private[master] class FileSystemPersistenceEngine( } override def unpersist(name: String): Unit = { - new File(dir + File.separator + name).delete() + val f = new File(dir + File.separator + name) + if (!f.delete()) { + logWarning(s"Error deleting ${f.getPath()}") + } } override def read[T: ClassTag](prefix: String): Seq[T] = { diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 26904d39a9bec..d518e92133aad 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -944,7 +944,7 @@ private[deploy] class Master( val logInput = EventLoggingListener.openEventLog(new Path(eventLogFile), fs) val replayBus = new ReplayListenerBus() val ui = SparkUI.createHistoryUI(new SparkConf, replayBus, new SecurityManager(conf), - appName + status, HistoryServer.UI_PATH_PREFIX + s"/${app.id}", app.startTime) + appName, HistoryServer.UI_PATH_PREFIX + s"/${app.id}", app.startTime) val maybeTruncated = eventLogFile.endsWith(EventLoggingListener.IN_PROGRESS) try { replayBus.replay(logInput, eventLogFile, maybeTruncated) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala index 1fe956320a1b8..957a928bc402b 100644 --- 
a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala @@ -392,15 +392,14 @@ private[spark] object RestSubmissionClient { mainClass: String, appArgs: Array[String], conf: SparkConf, - env: Map[String, String] = sys.env): SubmitRestProtocolResponse = { + env: Map[String, String] = Map()): SubmitRestProtocolResponse = { val master = conf.getOption("spark.master").getOrElse { throw new IllegalArgumentException("'spark.master' must be set.") } val sparkProperties = conf.getAll.toMap - val environmentVariables = env.filter { case (k, _) => k.startsWith("SPARK_") } val client = new RestSubmissionClient(master) val submitRequest = client.constructSubmitRequest( - appResource, mainClass, appArgs, sparkProperties, environmentVariables) + appResource, mainClass, appArgs, sparkProperties, env) client.createSubmission(submitRequest) } @@ -413,6 +412,16 @@ private[spark] object RestSubmissionClient { val mainClass = args(1) val appArgs = args.slice(2, args.size) val conf = new SparkConf - run(appResource, mainClass, appArgs, conf) + val env = filterSystemEnvironment(sys.env) + run(appResource, mainClass, appArgs, conf, env) + } + + /** + * Filter non-spark environment variables from any environment. + */ + private[rest] def filterSystemEnvironment(env: Map[String, String]): Map[String, String] = { + env.filter { case (k, _) => + (k.startsWith("SPARK_") && k != "SPARK_ENV_LOADED") || k.startsWith("MESOS_") + } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala index 709a27233598c..1a0598e50dcf1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala @@ -20,9 +20,8 @@ package org.apache.spark.deploy.worker.ui import java.io.File import javax.servlet.http.HttpServletRequest -import org.apache.spark.{Logging, SparkConf} +import org.apache.spark.Logging import org.apache.spark.deploy.worker.Worker -import org.apache.spark.deploy.worker.ui.WorkerWebUI._ import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.RpcUtils @@ -49,7 +48,9 @@ class WorkerWebUI( attachPage(new WorkerPage(this)) attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static")) attachHandler(createServletHandler("/log", - (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr)) + (request: HttpServletRequest) => logPage.renderLog(request), + worker.securityMgr, + worker.conf)) } } diff --git a/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala b/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala index f47d7ef511da1..7d84889a2def0 100644 --- a/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala +++ b/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala @@ -26,8 +26,8 @@ private[spark] class CommitDeniedException( msg: String, jobID: Int, splitID: Int, - attemptID: Int) + attemptNumber: Int) extends Exception(msg) { - def toTaskEndReason: TaskEndReason = TaskCommitDenied(jobID, splitID, attemptID) + def toTaskEndReason: TaskEndReason = TaskCommitDenied(jobID, splitID, attemptNumber) } diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala index 
a5ad47293f1c2..e2ffc3b64e5db 100644 --- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala +++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala @@ -131,8 +131,8 @@ private[spark] class StreamInputFormat extends StreamFileInputFormat[PortableDat */ @Experimental class PortableDataStream( - @transient isplit: CombineFileSplit, - @transient context: TaskAttemptContext, + isplit: CombineFileSplit, + context: TaskAttemptContext, index: Integer) extends Serializable { diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index 607d5a321efca..9dc36704a676d 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -148,7 +148,7 @@ class SnappyCompressionCodec(conf: SparkConf) extends CompressionCodec { try { Snappy.getNativeLibraryVersion } catch { - case e: Error => throw new IllegalArgumentException + case e: Error => throw new IllegalArgumentException(e) } override def compressedOutputStream(s: OutputStream): OutputStream = { diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala index f405b732e4725..f7298e8d5c62c 100644 --- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala +++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala @@ -91,8 +91,7 @@ object SparkHadoopMapRedUtil extends Logging { committer: MapReduceOutputCommitter, mrTaskContext: MapReduceTaskAttemptContext, jobId: Int, - splitId: Int, - attemptId: Int): Unit = { + splitId: Int): Unit = { val mrTaskAttemptID = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(mrTaskContext) @@ -122,7 +121,8 @@ object SparkHadoopMapRedUtil extends Logging { if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator - val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, attemptId) + val taskAttemptNumber = TaskContext.get().attemptNumber() + val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, taskAttemptNumber) if (canCommit) { performCommit() @@ -132,7 +132,7 @@ object SparkHadoopMapRedUtil extends Logging { logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) - throw new CommitDeniedException(message, jobId, splitId, attemptId) + throw new CommitDeniedException(message, jobId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination @@ -143,16 +143,4 @@ object SparkHadoopMapRedUtil extends Logging { logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") } } - - def commitTask( - committer: MapReduceOutputCommitter, - mrTaskContext: MapReduceTaskAttemptContext, - sparkTaskContext: TaskContext): Unit = { - commitTask( - committer, - mrTaskContext, - sparkTaskContext.stageId(), - sparkTaskContext.partitionId(), - sparkTaskContext.attemptNumber()) - } } diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala index 4517f465ebd3b..48afe3ae3511f 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala @@ -88,7 +88,7 @@ private[spark] class 
MetricsSystem private ( */ def getServletHandlers: Array[ServletContextHandler] = { require(running, "Can only call getServletHandlers on a running MetricsSystem") - metricsServlet.map(_.getHandlers).getOrElse(Array()) + metricsServlet.map(_.getHandlers(conf)).getOrElse(Array()) } metricsConfig.initialize() diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala b/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala index 0c2e212a33074..4193e1d21d3c1 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala @@ -27,7 +27,7 @@ import com.codahale.metrics.json.MetricsModule import com.fasterxml.jackson.databind.ObjectMapper import org.eclipse.jetty.servlet.ServletContextHandler -import org.apache.spark.SecurityManager +import org.apache.spark.{SparkConf, SecurityManager} import org.apache.spark.ui.JettyUtils._ private[spark] class MetricsServlet( @@ -49,10 +49,10 @@ private[spark] class MetricsServlet( val mapper = new ObjectMapper().registerModule( new MetricsModule(TimeUnit.SECONDS, TimeUnit.MILLISECONDS, servletShowSample)) - def getHandlers: Array[ServletContextHandler] = { + def getHandlers(conf: SparkConf): Array[ServletContextHandler] = { Array[ServletContextHandler]( createServletHandler(servletPath, - new ServletParams(request => getMetricsSnapshot(request), "text/json"), securityMgr) + new ServletParams(request => getMetricsSnapshot(request), "text/json"), securityMgr, conf) ) } diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala index 7c170a742fb64..76968249fb625 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala @@ -38,6 +38,7 @@ import org.apache.spark.storage.{BlockId, StorageLevel} * is equivalent to one Spark-level shuffle block. 
*/ class NettyBlockRpcServer( + appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { @@ -55,7 +56,7 @@ class NettyBlockRpcServer( case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) - val streamId = streamManager.registerStream(blocks.iterator.asJava) + val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index ff8aae9ebe9f0..70a42f9045e6b 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -49,7 +49,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage private[this] var appId: String = _ override def init(blockDataManager: BlockDataManager): Unit = { - val rpcHandler = new NettyBlockRpcServer(serializer, blockDataManager) + val rpcHandler = new NettyBlockRpcServer(conf.getAppId, serializer, blockDataManager) var serverBootstrap: Option[TransportServerBootstrap] = None var clientBootstrap: Option[TransportClientBootstrap] = None if (authEnabled) { @@ -137,7 +137,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage new RpcResponseCallback { override def onSuccess(response: Array[Byte]): Unit = { logTrace(s"Successfully uploaded block $blockId") - result.success() + result.success((): Unit) } override def onFailure(e: Throwable): Unit = { logError(s"Error while uploading block $blockId", e) @@ -149,7 +149,11 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage } override def close(): Unit = { - server.close() - clientFactory.close() + if (server != null) { + server.close() + } + if (clientFactory != null) { + clientFactory.close() + } } } diff --git a/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala b/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala deleted file mode 100644 index 79cb0640c8672..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.network.nio - -import java.nio.ByteBuffer - -import org.apache.spark.storage.{BlockId, StorageLevel, TestBlockId} - -import scala.collection.mutable.{ArrayBuffer, StringBuilder} - -// private[spark] because we need to register them in Kryo -private[spark] case class GetBlock(id: BlockId) -private[spark] case class GotBlock(id: BlockId, data: ByteBuffer) -private[spark] case class PutBlock(id: BlockId, data: ByteBuffer, level: StorageLevel) - -private[nio] class BlockMessage() { - // Un-initialized: typ = 0 - // GetBlock: typ = 1 - // GotBlock: typ = 2 - // PutBlock: typ = 3 - private var typ: Int = BlockMessage.TYPE_NON_INITIALIZED - private var id: BlockId = null - private var data: ByteBuffer = null - private var level: StorageLevel = null - - def set(getBlock: GetBlock) { - typ = BlockMessage.TYPE_GET_BLOCK - id = getBlock.id - } - - def set(gotBlock: GotBlock) { - typ = BlockMessage.TYPE_GOT_BLOCK - id = gotBlock.id - data = gotBlock.data - } - - def set(putBlock: PutBlock) { - typ = BlockMessage.TYPE_PUT_BLOCK - id = putBlock.id - data = putBlock.data - level = putBlock.level - } - - def set(buffer: ByteBuffer) { - typ = buffer.getInt() - val idLength = buffer.getInt() - val idBuilder = new StringBuilder(idLength) - for (i <- 1 to idLength) { - idBuilder += buffer.getChar() - } - id = BlockId(idBuilder.toString) - - if (typ == BlockMessage.TYPE_PUT_BLOCK) { - - val booleanInt = buffer.getInt() - val replication = buffer.getInt() - level = StorageLevel(booleanInt, replication) - - val dataLength = buffer.getInt() - data = ByteBuffer.allocate(dataLength) - if (dataLength != buffer.remaining) { - throw new Exception("Error parsing buffer") - } - data.put(buffer) - data.flip() - } else if (typ == BlockMessage.TYPE_GOT_BLOCK) { - - val dataLength = buffer.getInt() - data = ByteBuffer.allocate(dataLength) - if (dataLength != buffer.remaining) { - throw new Exception("Error parsing buffer") - } - data.put(buffer) - data.flip() - } - - } - - def set(bufferMsg: BufferMessage) { - val buffer = bufferMsg.buffers.apply(0) - buffer.clear() - set(buffer) - } - - def getType: Int = typ - def getId: BlockId = id - def getData: ByteBuffer = data - def getLevel: StorageLevel = level - - def toBufferMessage: BufferMessage = { - val buffers = new ArrayBuffer[ByteBuffer]() - var buffer = ByteBuffer.allocate(4 + 4 + id.name.length * 2) - buffer.putInt(typ).putInt(id.name.length) - id.name.foreach((x: Char) => buffer.putChar(x)) - buffer.flip() - buffers += buffer - - if (typ == BlockMessage.TYPE_PUT_BLOCK) { - buffer = ByteBuffer.allocate(8).putInt(level.toInt).putInt(level.replication) - buffer.flip() - buffers += buffer - - buffer = ByteBuffer.allocate(4).putInt(data.remaining) - buffer.flip() - buffers += buffer - - buffers += data - } else if (typ == BlockMessage.TYPE_GOT_BLOCK) { - buffer = ByteBuffer.allocate(4).putInt(data.remaining) - buffer.flip() - buffers += buffer - - buffers += data - } - - Message.createBufferMessage(buffers) - } - - override def toString: String = { - "BlockMessage [type = " + typ + ", id = " + id + ", level = " + level + - ", data = " + (if (data != null) data.remaining.toString else "null") + "]" - } -} - -private[nio] object BlockMessage { - val TYPE_NON_INITIALIZED: Int = 0 - val TYPE_GET_BLOCK: Int = 1 - val TYPE_GOT_BLOCK: Int = 2 - val TYPE_PUT_BLOCK: Int = 3 - - def fromBufferMessage(bufferMessage: BufferMessage): BlockMessage = { - val newBlockMessage = new BlockMessage() - newBlockMessage.set(bufferMessage) - newBlockMessage - } - - 
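The set(buffer)/toBufferMessage pair above defines the (now removed) wire layout: a 4-byte type, a 4-byte id length, the id as 2-byte chars, and for PUT/GOT messages the storage level and/or a length-prefixed data buffer. A tiny illustration of the GetBlock frame using that same layout (the literal "test_0" id is only an example):

    import java.nio.ByteBuffer

    val id = "test_0"                                  // a BlockId.name
    val frame = ByteBuffer.allocate(4 + 4 + id.length * 2)
    frame.putInt(1)                                    // BlockMessage.TYPE_GET_BLOCK
    frame.putInt(id.length)
    id.foreach((c: Char) => frame.putChar(c))
    frame.flip()                                       // ready to be wrapped in a BufferMessage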
def fromByteBuffer(buffer: ByteBuffer): BlockMessage = { - val newBlockMessage = new BlockMessage() - newBlockMessage.set(buffer) - newBlockMessage - } - - def fromGetBlock(getBlock: GetBlock): BlockMessage = { - val newBlockMessage = new BlockMessage() - newBlockMessage.set(getBlock) - newBlockMessage - } - - def fromGotBlock(gotBlock: GotBlock): BlockMessage = { - val newBlockMessage = new BlockMessage() - newBlockMessage.set(gotBlock) - newBlockMessage - } - - def fromPutBlock(putBlock: PutBlock): BlockMessage = { - val newBlockMessage = new BlockMessage() - newBlockMessage.set(putBlock) - newBlockMessage - } -} diff --git a/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala b/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala deleted file mode 100644 index f1c9ea8b64ca3..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.network.nio - -import java.nio.ByteBuffer - -import org.apache.spark._ -import org.apache.spark.storage.{StorageLevel, TestBlockId} - -import scala.collection.mutable.ArrayBuffer - -private[nio] -class BlockMessageArray(var blockMessages: Seq[BlockMessage]) - extends Seq[BlockMessage] with Logging { - - def this(bm: BlockMessage) = this(Array(bm)) - - def this() = this(null.asInstanceOf[Seq[BlockMessage]]) - - def apply(i: Int): BlockMessage = blockMessages(i) - - def iterator: Iterator[BlockMessage] = blockMessages.iterator - - def length: Int = blockMessages.length - - def set(bufferMessage: BufferMessage) { - val startTime = System.currentTimeMillis - val newBlockMessages = new ArrayBuffer[BlockMessage]() - val buffer = bufferMessage.buffers(0) - buffer.clear() - while (buffer.remaining() > 0) { - val size = buffer.getInt() - logDebug("Creating block message of size " + size + " bytes") - val newBuffer = buffer.slice() - newBuffer.clear() - newBuffer.limit(size) - logDebug("Trying to convert buffer " + newBuffer + " to block message") - val newBlockMessage = BlockMessage.fromByteBuffer(newBuffer) - logDebug("Created " + newBlockMessage) - newBlockMessages += newBlockMessage - buffer.position(buffer.position() + size) - } - val finishTime = System.currentTimeMillis - logDebug("Converted block message array from buffer message in " + - (finishTime - startTime) / 1000.0 + " s") - this.blockMessages = newBlockMessages - } - - def toBufferMessage: BufferMessage = { - val buffers = new ArrayBuffer[ByteBuffer]() - - blockMessages.foreach(blockMessage => { - val bufferMessage = blockMessage.toBufferMessage - logDebug("Adding " + blockMessage) - val sizeBuffer = ByteBuffer.allocate(4).putInt(bufferMessage.size) - sizeBuffer.flip - 
buffers += sizeBuffer - buffers ++= bufferMessage.buffers - logDebug("Added " + bufferMessage) - }) - - logDebug("Buffer list:") - buffers.foreach((x: ByteBuffer) => logDebug("" + x)) - Message.createBufferMessage(buffers) - } -} - -private[nio] object BlockMessageArray extends Logging { - - def fromBufferMessage(bufferMessage: BufferMessage): BlockMessageArray = { - val newBlockMessageArray = new BlockMessageArray() - newBlockMessageArray.set(bufferMessage) - newBlockMessageArray - } - - def main(args: Array[String]) { - val blockMessages = - (0 until 10).map { i => - if (i % 2 == 0) { - val buffer = ByteBuffer.allocate(100) - buffer.clear() - BlockMessage.fromPutBlock(PutBlock(TestBlockId(i.toString), buffer, - StorageLevel.MEMORY_ONLY_SER)) - } else { - BlockMessage.fromGetBlock(GetBlock(TestBlockId(i.toString))) - } - } - val blockMessageArray = new BlockMessageArray(blockMessages) - logDebug("Block message array created") - - val bufferMessage = blockMessageArray.toBufferMessage - logDebug("Converted to buffer message") - - val totalSize = bufferMessage.size - val newBuffer = ByteBuffer.allocate(totalSize) - newBuffer.clear() - bufferMessage.buffers.foreach(buffer => { - assert (0 == buffer.position()) - newBuffer.put(buffer) - buffer.rewind() - }) - newBuffer.flip - val newBufferMessage = Message.createBufferMessage(newBuffer) - logDebug("Copied to new buffer message, size = " + newBufferMessage.size) - - val newBlockMessageArray = BlockMessageArray.fromBufferMessage(newBufferMessage) - logDebug("Converted back to block message array") - // scalastyle:off println - newBlockMessageArray.foreach(blockMessage => { - blockMessage.getType match { - case BlockMessage.TYPE_PUT_BLOCK => { - val pB = PutBlock(blockMessage.getId, blockMessage.getData, blockMessage.getLevel) - println(pB) - } - case BlockMessage.TYPE_GET_BLOCK => { - val gB = new GetBlock(blockMessage.getId) - println(gB) - } - } - }) - // scalastyle:on println - } -} - - diff --git a/core/src/main/scala/org/apache/spark/network/nio/BufferMessage.scala b/core/src/main/scala/org/apache/spark/network/nio/BufferMessage.scala deleted file mode 100644 index 9a9e22b0c2366..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/BufferMessage.scala +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.network.nio - -import java.nio.ByteBuffer - -import scala.collection.mutable.ArrayBuffer - -import org.apache.spark.storage.BlockManager - - -private[nio] -class BufferMessage(id_ : Int, val buffers: ArrayBuffer[ByteBuffer], var ackId: Int) - extends Message(Message.BUFFER_MESSAGE, id_) { - - val initialSize = currentSize() - var gotChunkForSendingOnce = false - - def size: Int = initialSize - - def currentSize(): Int = { - if (buffers == null || buffers.isEmpty) { - 0 - } else { - buffers.map(_.remaining).reduceLeft(_ + _) - } - } - - def getChunkForSending(maxChunkSize: Int): Option[MessageChunk] = { - if (maxChunkSize <= 0) { - throw new Exception("Max chunk size is " + maxChunkSize) - } - - val security = if (isSecurityNeg) 1 else 0 - if (size == 0 && !gotChunkForSendingOnce) { - val newChunk = new MessageChunk( - new MessageChunkHeader(typ, id, 0, 0, ackId, hasError, security, senderAddress), null) - gotChunkForSendingOnce = true - return Some(newChunk) - } - - while(!buffers.isEmpty) { - val buffer = buffers(0) - if (buffer.remaining == 0) { - BlockManager.dispose(buffer) - buffers -= buffer - } else { - val newBuffer = if (buffer.remaining <= maxChunkSize) { - buffer.duplicate() - } else { - buffer.slice().limit(maxChunkSize).asInstanceOf[ByteBuffer] - } - buffer.position(buffer.position + newBuffer.remaining) - val newChunk = new MessageChunk(new MessageChunkHeader( - typ, id, size, newBuffer.remaining, ackId, - hasError, security, senderAddress), newBuffer) - gotChunkForSendingOnce = true - return Some(newChunk) - } - } - None - } - - def getChunkForReceiving(chunkSize: Int): Option[MessageChunk] = { - // STRONG ASSUMPTION: BufferMessage created when receiving data has ONLY ONE data buffer - if (buffers.size > 1) { - throw new Exception("Attempting to get chunk from message with multiple data buffers") - } - val buffer = buffers(0) - val security = if (isSecurityNeg) 1 else 0 - if (buffer.remaining > 0) { - if (buffer.remaining < chunkSize) { - throw new Exception("Not enough space in data buffer for receiving chunk") - } - val newBuffer = buffer.slice().limit(chunkSize).asInstanceOf[ByteBuffer] - buffer.position(buffer.position + newBuffer.remaining) - val newChunk = new MessageChunk(new MessageChunkHeader( - typ, id, size, newBuffer.remaining, ackId, hasError, security, senderAddress), newBuffer) - return Some(newChunk) - } - None - } - - def flip() { - buffers.foreach(_.flip) - } - - def hasAckId(): Boolean = ackId != 0 - - def isCompletelyReceived: Boolean = !buffers(0).hasRemaining - - override def toString: String = { - if (hasAckId) { - "BufferAckMessage(aid = " + ackId + ", id = " + id + ", size = " + size + ")" - } else { - "BufferMessage(id = " + id + ", size = " + size + ")" - } - } -} diff --git a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala deleted file mode 100644 index 8d9ebadaf79d4..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala +++ /dev/null @@ -1,619 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
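getChunkForSending above never copies payload bytes: each chunk either duplicates the current buffer (when the remainder fits in maxChunkSize) or takes a bounded slice, then advances the source buffer past what was handed out. The core of that slicing, extracted as a standalone sketch:

    import java.nio.ByteBuffer

    def nextChunk(buffer: ByteBuffer, maxChunkSize: Int): ByteBuffer = {
      val chunk =
        if (buffer.remaining <= maxChunkSize) {
          buffer.duplicate()                           // whole remainder fits in one chunk
        } else {
          buffer.slice().limit(maxChunkSize).asInstanceOf[ByteBuffer]
        }
      buffer.position(buffer.position() + chunk.remaining)  // consume what the chunk covers
      chunk
    }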
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.network.nio - -import java.net._ -import java.nio._ -import java.nio.channels._ -import java.util.concurrent.ConcurrentLinkedQueue -import java.util.LinkedList - -import scala.collection.JavaConverters._ -import scala.collection.mutable.{ArrayBuffer, HashMap} -import scala.util.control.NonFatal - -import org.apache.spark._ -import org.apache.spark.network.sasl.{SparkSaslClient, SparkSaslServer} - -private[nio] -abstract class Connection(val channel: SocketChannel, val selector: Selector, - val socketRemoteConnectionManagerId: ConnectionManagerId, val connectionId: ConnectionId, - val securityMgr: SecurityManager) - extends Logging { - - var sparkSaslServer: SparkSaslServer = null - var sparkSaslClient: SparkSaslClient = null - - def this(channel_ : SocketChannel, selector_ : Selector, id_ : ConnectionId, - securityMgr_ : SecurityManager) = { - this(channel_, selector_, - ConnectionManagerId.fromSocketAddress( - channel_.socket.getRemoteSocketAddress.asInstanceOf[InetSocketAddress]), - id_, securityMgr_) - } - - channel.configureBlocking(false) - channel.socket.setTcpNoDelay(true) - channel.socket.setReuseAddress(true) - channel.socket.setKeepAlive(true) - /* channel.socket.setReceiveBufferSize(32768) */ - - @volatile private var closed = false - var onCloseCallback: Connection => Unit = null - val onExceptionCallbacks = new ConcurrentLinkedQueue[(Connection, Throwable) => Unit] - var onKeyInterestChangeCallback: (Connection, Int) => Unit = null - - val remoteAddress = getRemoteAddress() - - def isSaslComplete(): Boolean - - def resetForceReregister(): Boolean - - // Read channels typically do not register for write and write does not for read - // Now, we do have write registering for read too (temporarily), but this is to detect - // channel close NOT to actually read/consume data on it ! - // How does this work if/when we move to SSL ? - - // What is the interest to register with selector for when we want this connection to be selected - def registerInterest() - - // What is the interest to register with selector for when we want this connection to - // be de-selected - // Traditionally, 0 - but in our case, for example, for close-detection on SendingConnection hack, - // it will be SelectionKey.OP_READ (until we fix it properly) - def unregisterInterest() - - // On receiving a read event, should we change the interest for this channel or not ? - // Will be true for ReceivingConnection, false for SendingConnection. - def changeInterestForRead(): Boolean - - private def disposeSasl() { - if (sparkSaslServer != null) { - sparkSaslServer.dispose() - } - - if (sparkSaslClient != null) { - sparkSaslClient.dispose() - } - } - - // On receiving a write event, should we change the interest for this channel or not ? - // Will be false for ReceivingConnection, true for SendingConnection. 
- // Actually, for now, should not get triggered for ReceivingConnection - def changeInterestForWrite(): Boolean - - def getRemoteConnectionManagerId(): ConnectionManagerId = { - socketRemoteConnectionManagerId - } - - def key(): SelectionKey = channel.keyFor(selector) - - def getRemoteAddress(): InetSocketAddress = { - channel.socket.getRemoteSocketAddress().asInstanceOf[InetSocketAddress] - } - - // Returns whether we have to register for further reads or not. - def read(): Boolean = { - throw new UnsupportedOperationException( - "Cannot read on connection of type " + this.getClass.toString) - } - - // Returns whether we have to register for further writes or not. - def write(): Boolean = { - throw new UnsupportedOperationException( - "Cannot write on connection of type " + this.getClass.toString) - } - - def close() { - closed = true - val k = key() - if (k != null) { - k.cancel() - } - channel.close() - disposeSasl() - callOnCloseCallback() - } - - protected def isClosed: Boolean = closed - - def onClose(callback: Connection => Unit) { - onCloseCallback = callback - } - - def onException(callback: (Connection, Throwable) => Unit) { - onExceptionCallbacks.add(callback) - } - - def onKeyInterestChange(callback: (Connection, Int) => Unit) { - onKeyInterestChangeCallback = callback - } - - def callOnExceptionCallbacks(e: Throwable) { - onExceptionCallbacks.asScala.foreach { - callback => - try { - callback(this, e) - } catch { - case NonFatal(e) => { - logWarning("Ignored error in onExceptionCallback", e) - } - } - } - } - - def callOnCloseCallback() { - if (onCloseCallback != null) { - onCloseCallback(this) - } else { - logWarning("Connection to " + getRemoteConnectionManagerId() + - " closed and OnExceptionCallback not registered") - } - - } - - def changeConnectionKeyInterest(ops: Int) { - if (onKeyInterestChangeCallback != null) { - onKeyInterestChangeCallback(this, ops) - } else { - throw new Exception("OnKeyInterestChangeCallback not registered") - } - } - - def printRemainingBuffer(buffer: ByteBuffer) { - val bytes = new Array[Byte](buffer.remaining) - val curPosition = buffer.position - buffer.get(bytes) - bytes.foreach(x => print(x + " ")) - buffer.position(curPosition) - print(" (" + bytes.length + ")") - } - - def printBuffer(buffer: ByteBuffer, position: Int, length: Int) { - val bytes = new Array[Byte](length) - val curPosition = buffer.position - buffer.position(position) - buffer.get(bytes) - bytes.foreach(x => print(x + " ")) - print(" (" + position + ", " + length + ")") - buffer.position(curPosition) - } -} - - -private[nio] -class SendingConnection(val address: InetSocketAddress, selector_ : Selector, - remoteId_ : ConnectionManagerId, id_ : ConnectionId, - securityMgr_ : SecurityManager) - extends Connection(SocketChannel.open, selector_, remoteId_, id_, securityMgr_) { - - def isSaslComplete(): Boolean = { - if (sparkSaslClient != null) sparkSaslClient.isComplete() else false - } - - private class Outbox { - val messages = new LinkedList[Message]() - val defaultChunkSize = 65536 - var nextMessageToBeUsed = 0 - - def addMessage(message: Message) { - messages.synchronized { - messages.add(message) - logDebug("Added [" + message + "] to outbox for sending to " + - "[" + getRemoteConnectionManagerId() + "]") - } - } - - def getChunk(): Option[MessageChunk] = { - messages.synchronized { - while (!messages.isEmpty) { - /* nextMessageToBeUsed = nextMessageToBeUsed % messages.size */ - /* val message = messages(nextMessageToBeUsed) */ - - val message = if 
(securityMgr.isAuthenticationEnabled() && !isSaslComplete()) { - // only allow sending of security messages until sasl is complete - var pos = 0 - var securityMsg: Message = null - while (pos < messages.size() && securityMsg == null) { - if (messages.get(pos).isSecurityNeg) { - securityMsg = messages.remove(pos) - } - pos = pos + 1 - } - // didn't find any security messages and auth isn't completed so return - if (securityMsg == null) return None - securityMsg - } else { - messages.removeFirst() - } - - val chunk = message.getChunkForSending(defaultChunkSize) - if (chunk.isDefined) { - messages.add(message) - nextMessageToBeUsed = nextMessageToBeUsed + 1 - if (!message.started) { - logDebug( - "Starting to send [" + message + "] to [" + getRemoteConnectionManagerId() + "]") - message.started = true - message.startTime = System.currentTimeMillis - } - logTrace( - "Sending chunk from [" + message + "] to [" + getRemoteConnectionManagerId() + "]") - return chunk - } else { - message.finishTime = System.currentTimeMillis - logDebug("Finished sending [" + message + "] to [" + getRemoteConnectionManagerId() + - "] in " + message.timeTaken ) - } - } - } - None - } - } - - // outbox is used as a lock - ensure that it is always used as a leaf (since methods which - // lock it are invoked in context of other locks) - private val outbox = new Outbox() - /* - This is orthogonal to whether we have pending bytes to write or not - and satisfies a slightly - different purpose. This flag is to see if we need to force reregister for write even when we - do not have any pending bytes to write to socket. - This can happen due to a race between adding pending buffers, and checking for existing of - data as detailed in https://github.com/mesos/spark/pull/791 - */ - private var needForceReregister = false - - val currentBuffers = new ArrayBuffer[ByteBuffer]() - - /* channel.socket.setSendBufferSize(256 * 1024) */ - - override def getRemoteAddress(): InetSocketAddress = address - - val DEFAULT_INTEREST = SelectionKey.OP_READ - - override def registerInterest() { - // Registering read too - does not really help in most cases, but for some - // it does - so let us keep it for now. - changeConnectionKeyInterest(SelectionKey.OP_WRITE | DEFAULT_INTEREST) - } - - override def unregisterInterest() { - changeConnectionKeyInterest(DEFAULT_INTEREST) - } - - def registerAfterAuth(): Unit = { - outbox.synchronized { - needForceReregister = true - } - if (channel.isConnected) { - registerInterest() - } - } - - def send(message: Message) { - outbox.synchronized { - outbox.addMessage(message) - needForceReregister = true - } - if (channel.isConnected) { - registerInterest() - } - } - - // return previous value after resetting it. - def resetForceReregister(): Boolean = { - outbox.synchronized { - val result = needForceReregister - needForceReregister = false - result - } - } - - // MUST be called within the selector loop - def connect() { - try { - channel.register(selector, SelectionKey.OP_CONNECT) - channel.connect(address) - logInfo("Initiating connection to [" + address + "]") - } catch { - case e: Exception => - logError("Error connecting to " + address, e) - callOnExceptionCallbacks(e) - } - } - - def finishConnect(force: Boolean): Boolean = { - try { - // Typically, this should finish immediately since it was triggered by a connect - // selection - though need not necessarily always complete successfully. 
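The "outbox is used as a lock - ensure that it is always used as a leaf" comment above is a lock-ordering rule: outbox.synchronized blocks only touch outbox state and never take other locks or invoke callbacks, so a thread already holding a coarser lock can enter them without risking a deadlock cycle. A minimal sketch of that discipline (Sender and its toy queue are illustrative):

    class Sender {
      private val outbox = new scala.collection.mutable.ArrayBuffer[String]()  // toy payload

      def send(msg: String): Unit = {
        outbox.synchronized {
          outbox += msg        // leaf lock: mutate outbox state only; no other locks, no callbacks
        }
        wakeUpSelectorLoop()   // anything that might lock elsewhere runs after releasing it
      }

      private def wakeUpSelectorLoop(): Unit = ()  // placeholder
    }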
- val connected = channel.finishConnect - if (!force && !connected) { - logInfo( - "finish connect failed [" + address + "], " + outbox.messages.size + " messages pending") - return false - } - - // Fallback to previous behavior - assume finishConnect completed - // This will happen only when finishConnect failed for some repeated number of times - // (10 or so) - // Is highly unlikely unless there was an unclean close of socket, etc - registerInterest() - logInfo("Connected to [" + address + "], " + outbox.messages.size + " messages pending") - } catch { - case e: Exception => { - logWarning("Error finishing connection to " + address, e) - callOnExceptionCallbacks(e) - } - } - true - } - - override def write(): Boolean = { - try { - while (true) { - if (currentBuffers.size == 0) { - outbox.synchronized { - outbox.getChunk() match { - case Some(chunk) => { - val buffers = chunk.buffers - // If we have 'seen' pending messages, then reset flag - since we handle that as - // normal registering of event (below) - if (needForceReregister && buffers.exists(_.remaining() > 0)) resetForceReregister() - - currentBuffers ++= buffers - } - case None => { - // changeConnectionKeyInterest(0) - /* key.interestOps(0) */ - return false - } - } - } - } - - if (currentBuffers.size > 0) { - val buffer = currentBuffers(0) - val remainingBytes = buffer.remaining - val writtenBytes = channel.write(buffer) - if (buffer.remaining == 0) { - currentBuffers -= buffer - } - if (writtenBytes < remainingBytes) { - // re-register for write. - return true - } - } - } - } catch { - case e: Exception => { - logWarning("Error writing in connection to " + getRemoteConnectionManagerId(), e) - callOnExceptionCallbacks(e) - close() - return false - } - } - // should not happen - to keep scala compiler happy - true - } - - // This is a hack to determine if remote socket was closed or not. - // SendingConnection DOES NOT expect to receive any data - if it does, it is an error - // For a bunch of cases, read will return -1 in case remote socket is closed : hence we - // register for reads to determine that. - override def read(): Boolean = { - // We don't expect the other side to send anything; so, we just read to detect an error or EOF. - try { - val length = channel.read(ByteBuffer.allocate(1)) - if (length == -1) { // EOF - close() - } else if (length > 0) { - logWarning( - "Unexpected data read from SendingConnection to " + getRemoteConnectionManagerId()) - } - } catch { - case e: Exception => - logError("Exception while reading SendingConnection to " + getRemoteConnectionManagerId(), - e) - callOnExceptionCallbacks(e) - close() - } - - false - } - - override def changeInterestForRead(): Boolean = false - - override def changeInterestForWrite(): Boolean = ! 
isClosed -} - - -// Must be created within selector loop - else deadlock -private[spark] class ReceivingConnection( - channel_ : SocketChannel, - selector_ : Selector, - id_ : ConnectionId, - securityMgr_ : SecurityManager) - extends Connection(channel_, selector_, id_, securityMgr_) { - - def isSaslComplete(): Boolean = { - if (sparkSaslServer != null) sparkSaslServer.isComplete() else false - } - - class Inbox() { - val messages = new HashMap[Int, BufferMessage]() - - def getChunk(header: MessageChunkHeader): Option[MessageChunk] = { - - def createNewMessage: BufferMessage = { - val newMessage = Message.create(header).asInstanceOf[BufferMessage] - newMessage.started = true - newMessage.startTime = System.currentTimeMillis - newMessage.isSecurityNeg = header.securityNeg == 1 - logDebug( - "Starting to receive [" + newMessage + "] from [" + getRemoteConnectionManagerId() + "]") - messages += ((newMessage.id, newMessage)) - newMessage - } - - val message = messages.getOrElseUpdate(header.id, createNewMessage) - logTrace( - "Receiving chunk of [" + message + "] from [" + getRemoteConnectionManagerId() + "]") - message.getChunkForReceiving(header.chunkSize) - } - - def getMessageForChunk(chunk: MessageChunk): Option[BufferMessage] = { - messages.get(chunk.header.id) - } - - def removeMessage(message: Message) { - messages -= message.id - } - } - - @volatile private var inferredRemoteManagerId: ConnectionManagerId = null - - override def getRemoteConnectionManagerId(): ConnectionManagerId = { - val currId = inferredRemoteManagerId - if (currId != null) currId else super.getRemoteConnectionManagerId() - } - - // The receiver's remote address is the local socket on remote side : which is NOT - // the connection manager id of the receiver. - // We infer that from the messages we receive on the receiver socket. - private def processConnectionManagerId(header: MessageChunkHeader) { - val currId = inferredRemoteManagerId - if (header.address == null || currId != null) return - - val managerId = ConnectionManagerId.fromSocketAddress(header.address) - - if (managerId != null) { - inferredRemoteManagerId = managerId - } - } - - - val inbox = new Inbox() - val headerBuffer: ByteBuffer = ByteBuffer.allocate(MessageChunkHeader.HEADER_SIZE) - var onReceiveCallback: (Connection, Message) => Unit = null - var currentChunk: MessageChunk = null - - channel.register(selector, SelectionKey.OP_READ) - - override def read(): Boolean = { - try { - while (true) { - if (currentChunk == null) { - val headerBytesRead = channel.read(headerBuffer) - if (headerBytesRead == -1) { - close() - return false - } - if (headerBuffer.remaining > 0) { - // re-register for read event ... - return true - } - headerBuffer.flip - if (headerBuffer.remaining != MessageChunkHeader.HEADER_SIZE) { - throw new Exception( - "Unexpected number of bytes (" + headerBuffer.remaining + ") in the header") - } - val header = MessageChunkHeader.create(headerBuffer) - headerBuffer.clear() - - processConnectionManagerId(header) - - header.typ match { - case Message.BUFFER_MESSAGE => { - if (header.totalSize == 0) { - if (onReceiveCallback != null) { - onReceiveCallback(this, Message.create(header)) - } - currentChunk = null - // re-register for read event ... 
- return true - } else { - currentChunk = inbox.getChunk(header).orNull - } - } - case _ => throw new Exception("Message of unknown type received") - } - } - - if (currentChunk == null) throw new Exception("No message chunk to receive data") - - val bytesRead = channel.read(currentChunk.buffer) - if (bytesRead == 0) { - // re-register for read event ... - return true - } else if (bytesRead == -1) { - close() - return false - } - - /* logDebug("Read " + bytesRead + " bytes for the buffer") */ - - if (currentChunk.buffer.remaining == 0) { - /* println("Filled buffer at " + System.currentTimeMillis) */ - val bufferMessage = inbox.getMessageForChunk(currentChunk).get - if (bufferMessage.isCompletelyReceived) { - bufferMessage.flip() - bufferMessage.finishTime = System.currentTimeMillis - logDebug("Finished receiving [" + bufferMessage + "] from " + - "[" + getRemoteConnectionManagerId() + "] in " + bufferMessage.timeTaken) - if (onReceiveCallback != null) { - onReceiveCallback(this, bufferMessage) - } - inbox.removeMessage(bufferMessage) - } - currentChunk = null - } - } - } catch { - case e: Exception => { - logWarning("Error reading from connection to " + getRemoteConnectionManagerId(), e) - callOnExceptionCallbacks(e) - close() - return false - } - } - // should not happen - to keep scala compiler happy - true - } - - def onReceive(callback: (Connection, Message) => Unit) {onReceiveCallback = callback} - - // override def changeInterestForRead(): Boolean = ! isClosed - override def changeInterestForRead(): Boolean = true - - override def changeInterestForWrite(): Boolean = { - throw new IllegalStateException("Unexpected invocation right now") - } - - override def registerInterest() { - // Registering read too - does not really help in most cases, but for some - // it does - so let us keep it for now. - changeConnectionKeyInterest(SelectionKey.OP_READ) - } - - override def unregisterInterest() { - changeConnectionKeyInterest(0) - } - - // For read conn, always false. - override def resetForceReregister(): Boolean = false -} diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionId.scala b/core/src/main/scala/org/apache/spark/network/nio/ConnectionId.scala deleted file mode 100644 index b3b281ff465f1..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionId.scala +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.network.nio - -private[nio] case class ConnectionId(connectionManagerId: ConnectionManagerId, uniqId: Int) { - override def toString: String = { - connectionManagerId.host + "_" + connectionManagerId.port + "_" + uniqId - } -} - -private[nio] object ConnectionId { - - def createConnectionIdFromString(connectionIdString: String): ConnectionId = { - val res = connectionIdString.split("_").map(_.trim()) - if (res.size != 3) { - throw new Exception("Error converting ConnectionId string: " + connectionIdString + - " to a ConnectionId Object") - } - new ConnectionId(new ConnectionManagerId(res(0), res(1).toInt), res(2).toInt) - } -} diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala deleted file mode 100644 index 9143918790381..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala +++ /dev/null @@ -1,1157 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.network.nio - -import java.io.IOException -import java.lang.ref.WeakReference -import java.net._ -import java.nio._ -import java.nio.channels._ -import java.nio.channels.spi._ -import java.util.concurrent.atomic.AtomicInteger -import java.util.concurrent.{LinkedBlockingDeque, ThreadPoolExecutor, TimeUnit} - -import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, SynchronizedMap, SynchronizedQueue} -import scala.concurrent.duration._ -import scala.concurrent.{Await, ExecutionContext, Future, Promise} -import scala.language.postfixOps - -import com.google.common.base.Charsets.UTF_8 -import io.netty.util.{Timeout, TimerTask, HashedWheelTimer} - -import org.apache.spark._ -import org.apache.spark.network.sasl.{SparkSaslClient, SparkSaslServer} -import org.apache.spark.util.{ThreadUtils, Utils} - -import scala.util.Try -import scala.util.control.NonFatal - -private[nio] class ConnectionManager( - port: Int, - conf: SparkConf, - securityManager: SecurityManager, - name: String = "Connection manager") - extends Logging { - - /** - * Used by sendMessageReliably to track messages being sent. 
- * @param message the message that was sent - * @param connectionManagerId the connection manager that sent this message - * @param completionHandler callback that's invoked when the send has completed or failed - */ - class MessageStatus( - val message: Message, - val connectionManagerId: ConnectionManagerId, - completionHandler: Try[Message] => Unit) { - - def success(ackMessage: Message) { - if (ackMessage == null) { - failure(new NullPointerException) - } - else { - completionHandler(scala.util.Success(ackMessage)) - } - } - - def failWithoutAck() { - completionHandler(scala.util.Failure(new IOException("Failed without being ACK'd"))) - } - - def failure(e: Throwable) { - completionHandler(scala.util.Failure(e)) - } - } - - private val selector = SelectorProvider.provider.openSelector() - private val ackTimeoutMonitor = - new HashedWheelTimer(ThreadUtils.namedThreadFactory("AckTimeoutMonitor")) - - private val ackTimeout = - conf.getTimeAsSeconds("spark.core.connection.ack.wait.timeout", - conf.get("spark.network.timeout", "120s")) - - // Get the thread counts from the Spark Configuration. - // - // Even though the ThreadPoolExecutor constructor takes both a minimum and maximum value, - // we only query for the minimum value because we are using LinkedBlockingDeque. - // - // The JavaDoc for ThreadPoolExecutor points out that when using a LinkedBlockingDeque (which is - // an unbounded queue) no more than corePoolSize threads will ever be created, so only the "min" - // parameter is necessary. - private val handlerThreadCount = conf.getInt("spark.core.connection.handler.threads.min", 20) - private val ioThreadCount = conf.getInt("spark.core.connection.io.threads.min", 4) - private val connectThreadCount = conf.getInt("spark.core.connection.connect.threads.min", 1) - - private val handleMessageExecutor = new ThreadPoolExecutor( - handlerThreadCount, - handlerThreadCount, - conf.getInt("spark.core.connection.handler.threads.keepalive", 60), TimeUnit.SECONDS, - new LinkedBlockingDeque[Runnable](), - ThreadUtils.namedThreadFactory("handle-message-executor")) { - - override def afterExecute(r: Runnable, t: Throwable): Unit = { - super.afterExecute(r, t) - if (t != null && NonFatal(t)) { - logError("Error in handleMessageExecutor is not handled properly", t) - } - } - } - - private val handleReadWriteExecutor = new ThreadPoolExecutor( - ioThreadCount, - ioThreadCount, - conf.getInt("spark.core.connection.io.threads.keepalive", 60), TimeUnit.SECONDS, - new LinkedBlockingDeque[Runnable](), - ThreadUtils.namedThreadFactory("handle-read-write-executor")) { - - override def afterExecute(r: Runnable, t: Throwable): Unit = { - super.afterExecute(r, t) - if (t != null && NonFatal(t)) { - logError("Error in handleReadWriteExecutor is not handled properly", t) - } - } - } - - // Use a different, yet smaller, thread pool - infrequently used with very short lived tasks : - // which should be executed asap - private val handleConnectExecutor = new ThreadPoolExecutor( - connectThreadCount, - connectThreadCount, - conf.getInt("spark.core.connection.connect.threads.keepalive", 60), TimeUnit.SECONDS, - new LinkedBlockingDeque[Runnable](), - ThreadUtils.namedThreadFactory("handle-connect-executor")) { - - override def afterExecute(r: Runnable, t: Throwable): Unit = { - super.afterExecute(r, t) - if (t != null && NonFatal(t)) { - logError("Error in handleConnectExecutor is not handled properly", t) - } - } - } - - private val serverChannel = ServerSocketChannel.open() - // used to track the SendingConnections 
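The comment above about only reading the ".min" settings follows from how ThreadPoolExecutor grows: threads beyond corePoolSize are created only when the work queue rejects a task, and an unbounded LinkedBlockingDeque never rejects, so the pool never exceeds corePoolSize and the maximum is irrelevant. A small self-contained illustration:

    import java.util.concurrent.{LinkedBlockingDeque, ThreadPoolExecutor, TimeUnit}

    val threads = 4  // e.g. the value of spark.core.connection.io.threads.min
    val pool = new ThreadPoolExecutor(
      threads,                            // corePoolSize: the only number that matters here
      threads,                            // maximumPoolSize: never reached with an unbounded queue
      60L, TimeUnit.SECONDS,
      new LinkedBlockingDeque[Runnable]())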
waiting to do SASL negotiation - private val connectionsAwaitingSasl = new HashMap[ConnectionId, SendingConnection] - with SynchronizedMap[ConnectionId, SendingConnection] - private val connectionsByKey = - new HashMap[SelectionKey, Connection] with SynchronizedMap[SelectionKey, Connection] - private val connectionsById = new HashMap[ConnectionManagerId, SendingConnection] - with SynchronizedMap[ConnectionManagerId, SendingConnection] - // Tracks sent messages for which we are awaiting acknowledgements. Entries are added to this - // map when messages are sent and are removed when acknowledgement messages are received or when - // acknowledgement timeouts expire - private val messageStatuses = new HashMap[Int, MessageStatus] // [MessageId, MessageStatus] - private val keyInterestChangeRequests = new SynchronizedQueue[(SelectionKey, Int)] - private val registerRequests = new SynchronizedQueue[SendingConnection] - - implicit val futureExecContext = ExecutionContext.fromExecutor( - ThreadUtils.newDaemonCachedThreadPool("Connection manager future execution context")) - - @volatile - private var onReceiveCallback: (BufferMessage, ConnectionManagerId) => Option[Message] = null - - private val authEnabled = securityManager.isAuthenticationEnabled() - - serverChannel.configureBlocking(false) - serverChannel.socket.setReuseAddress(true) - serverChannel.socket.setReceiveBufferSize(256 * 1024) - - private def startService(port: Int): (ServerSocketChannel, Int) = { - serverChannel.socket.bind(new InetSocketAddress(port)) - (serverChannel, serverChannel.socket.getLocalPort) - } - Utils.startServiceOnPort[ServerSocketChannel](port, startService, conf, name) - serverChannel.register(selector, SelectionKey.OP_ACCEPT) - - val id = new ConnectionManagerId(Utils.localHostName, serverChannel.socket.getLocalPort) - logInfo("Bound socket to port " + serverChannel.socket.getLocalPort() + " with id = " + id) - - // used in combination with the ConnectionManagerId to create unique Connection ids - // to be able to track asynchronous messages - private val idCount: AtomicInteger = new AtomicInteger(1) - - private val writeRunnableStarted: HashSet[SelectionKey] = new HashSet[SelectionKey]() - private val readRunnableStarted: HashSet[SelectionKey] = new HashSet[SelectionKey]() - - @volatile private var isActive = true - private val selectorThread = new Thread("connection-manager-thread") { - override def run(): Unit = ConnectionManager.this.run() - } - selectorThread.setDaemon(true) - // start this thread last, since it invokes run(), which accesses members above - selectorThread.start() - - private def triggerWrite(key: SelectionKey) { - val conn = connectionsByKey.getOrElse(key, null) - if (conn == null) return - - writeRunnableStarted.synchronized { - // So that we do not trigger more write events while processing this one. - // The write method will re-register when done. 
- if (conn.changeInterestForWrite()) conn.unregisterInterest() - if (writeRunnableStarted.contains(key)) { - // key.interestOps(key.interestOps() & ~ SelectionKey.OP_WRITE) - return - } - - writeRunnableStarted += key - } - handleReadWriteExecutor.execute(new Runnable { - override def run() { - try { - var register: Boolean = false - try { - register = conn.write() - } finally { - writeRunnableStarted.synchronized { - writeRunnableStarted -= key - val needReregister = register || conn.resetForceReregister() - if (needReregister && conn.changeInterestForWrite()) { - conn.registerInterest() - } - } - } - } catch { - case NonFatal(e) => { - logError("Error when writing to " + conn.getRemoteConnectionManagerId(), e) - conn.callOnExceptionCallbacks(e) - } - } - } - } ) - } - - - private def triggerRead(key: SelectionKey) { - val conn = connectionsByKey.getOrElse(key, null) - if (conn == null) return - - readRunnableStarted.synchronized { - // So that we do not trigger more read events while processing this one. - // The read method will re-register when done. - if (conn.changeInterestForRead())conn.unregisterInterest() - if (readRunnableStarted.contains(key)) { - return - } - - readRunnableStarted += key - } - handleReadWriteExecutor.execute(new Runnable { - override def run() { - try { - var register: Boolean = false - try { - register = conn.read() - } finally { - readRunnableStarted.synchronized { - readRunnableStarted -= key - if (register && conn.changeInterestForRead()) { - conn.registerInterest() - } - } - } - } catch { - case NonFatal(e) => { - logError("Error when reading from " + conn.getRemoteConnectionManagerId(), e) - conn.callOnExceptionCallbacks(e) - } - } - } - } ) - } - - private def triggerConnect(key: SelectionKey) { - val conn = connectionsByKey.getOrElse(key, null).asInstanceOf[SendingConnection] - if (conn == null) return - - // prevent other events from being triggered - // Since we are still trying to connect, we do not need to do the additional steps in - // triggerWrite - conn.changeConnectionKeyInterest(0) - - handleConnectExecutor.execute(new Runnable { - override def run() { - try { - var tries: Int = 10 - while (tries >= 0) { - if (conn.finishConnect(false)) return - // Sleep ? - Thread.sleep(1) - tries -= 1 - } - - // fallback to previous behavior : we should not really come here since this method was - // triggered since channel became connectable : but at times, the first finishConnect need - // not succeed : hence the loop to retry a few 'times'. - conn.finishConnect(true) - } catch { - case NonFatal(e) => { - logError("Error when finishConnect for " + conn.getRemoteConnectionManagerId(), e) - conn.callOnExceptionCallbacks(e) - } - } - } - } ) - } - - // MUST be called within selector loop - else deadlock. 
- private def triggerForceCloseByException(key: SelectionKey, e: Exception) { - try { - key.interestOps(0) - } catch { - // ignore exceptions - case e: Exception => logDebug("Ignoring exception", e) - } - - val conn = connectionsByKey.getOrElse(key, null) - if (conn == null) return - - // Pushing to connect threadpool - handleConnectExecutor.execute(new Runnable { - override def run() { - try { - conn.callOnExceptionCallbacks(e) - } catch { - // ignore exceptions - case NonFatal(e) => logDebug("Ignoring exception", e) - } - try { - conn.close() - } catch { - // ignore exceptions - case NonFatal(e) => logDebug("Ignoring exception", e) - } - } - }) - } - - - def run() { - try { - while (isActive) { - while (!registerRequests.isEmpty) { - val conn: SendingConnection = registerRequests.dequeue() - addListeners(conn) - conn.connect() - addConnection(conn) - } - - while(!keyInterestChangeRequests.isEmpty) { - val (key, ops) = keyInterestChangeRequests.dequeue() - - try { - if (key.isValid) { - val connection = connectionsByKey.getOrElse(key, null) - if (connection != null) { - val lastOps = key.interestOps() - key.interestOps(ops) - - // hot loop - prevent materialization of string if trace not enabled. - if (isTraceEnabled()) { - def intToOpStr(op: Int): String = { - val opStrs = ArrayBuffer[String]() - if ((op & SelectionKey.OP_READ) != 0) opStrs += "READ" - if ((op & SelectionKey.OP_WRITE) != 0) opStrs += "WRITE" - if ((op & SelectionKey.OP_CONNECT) != 0) opStrs += "CONNECT" - if ((op & SelectionKey.OP_ACCEPT) != 0) opStrs += "ACCEPT" - if (opStrs.size > 0) opStrs.reduceLeft(_ + " | " + _) else " " - } - - logTrace("Changed key for connection to [" + - connection.getRemoteConnectionManagerId() + "] changed from [" + - intToOpStr(lastOps) + "] to [" + intToOpStr(ops) + "]") - } - } - } else { - logInfo("Key not valid ? " + key) - throw new CancelledKeyException() - } - } catch { - case e: CancelledKeyException => { - logInfo("key already cancelled ? " + key, e) - triggerForceCloseByException(key, e) - } - case e: Exception => { - logError("Exception processing key " + key, e) - triggerForceCloseByException(key, e) - } - } - } - - val selectedKeysCount = - try { - selector.select() - } catch { - // Explicitly only dealing with CancelledKeyException here since other exceptions - // should be dealt with differently. - case e: CancelledKeyException => - // Some keys within the selectors list are invalid/closed. clear them. - val allKeys = selector.keys().iterator() - - while (allKeys.hasNext) { - val key = allKeys.next() - try { - if (! key.isValid) { - logInfo("Key not valid ? " + key) - throw new CancelledKeyException() - } - } catch { - case e: CancelledKeyException => { - logInfo("key already cancelled ? 
" + key, e) - triggerForceCloseByException(key, e) - } - case e: Exception => { - logError("Exception processing key " + key, e) - triggerForceCloseByException(key, e) - } - } - } - 0 - - case e: ClosedSelectorException => - logDebug("Failed select() as selector is closed.", e) - return - } - - if (selectedKeysCount == 0) { - logDebug("Selector selected " + selectedKeysCount + " of " + selector.keys.size + - " keys") - } - if (selectorThread.isInterrupted) { - logInfo("Selector thread was interrupted!") - return - } - - if (0 != selectedKeysCount) { - val selectedKeys = selector.selectedKeys().iterator() - while (selectedKeys.hasNext) { - val key = selectedKeys.next - selectedKeys.remove() - try { - if (key.isValid) { - if (key.isAcceptable) { - acceptConnection(key) - } else - if (key.isConnectable) { - triggerConnect(key) - } else - if (key.isReadable) { - triggerRead(key) - } else - if (key.isWritable) { - triggerWrite(key) - } - } else { - logInfo("Key not valid ? " + key) - throw new CancelledKeyException() - } - } catch { - // weird, but we saw this happening - even though key.isValid was true, - // key.isAcceptable would throw CancelledKeyException. - case e: CancelledKeyException => { - logInfo("key already cancelled ? " + key, e) - triggerForceCloseByException(key, e) - } - case e: Exception => { - logError("Exception processing key " + key, e) - triggerForceCloseByException(key, e) - } - } - } - } - } - } catch { - case e: Exception => logError("Error in select loop", e) - } - } - - def acceptConnection(key: SelectionKey) { - val serverChannel = key.channel.asInstanceOf[ServerSocketChannel] - - var newChannel = serverChannel.accept() - - // accept them all in a tight loop. non blocking accept with no processing, should be fine - while (newChannel != null) { - try { - val newConnectionId = new ConnectionId(id, idCount.getAndIncrement.intValue) - val newConnection = new ReceivingConnection(newChannel, selector, newConnectionId, - securityManager) - newConnection.onReceive(receiveMessage) - addListeners(newConnection) - addConnection(newConnection) - logInfo("Accepted connection from [" + newConnection.remoteAddress + "]") - } catch { - // might happen in case of issues with registering with selector - case e: Exception => logError("Error in accept loop", e) - } - - newChannel = serverChannel.accept() - } - } - - private def addListeners(connection: Connection) { - connection.onKeyInterestChange(changeConnectionKeyInterest) - connection.onException(handleConnectionError) - connection.onClose(removeConnection) - } - - def addConnection(connection: Connection) { - connectionsByKey += ((connection.key, connection)) - } - - def removeConnection(connection: Connection) { - connectionsByKey -= connection.key - - try { - connection match { - case sendingConnection: SendingConnection => - val sendingConnectionManagerId = sendingConnection.getRemoteConnectionManagerId() - logInfo("Removing SendingConnection to " + sendingConnectionManagerId) - - connectionsById -= sendingConnectionManagerId - connectionsAwaitingSasl -= connection.connectionId - - messageStatuses.synchronized { - messageStatuses.values.filter(_.connectionManagerId == sendingConnectionManagerId) - .foreach(status => { - logInfo("Notifying " + status) - status.failWithoutAck() - }) - - messageStatuses.retain((i, status) => { - status.connectionManagerId != sendingConnectionManagerId - }) - } - case receivingConnection: ReceivingConnection => - val remoteConnectionManagerId = receivingConnection.getRemoteConnectionManagerId() - 
logInfo("Removing ReceivingConnection to " + remoteConnectionManagerId) - - val sendingConnectionOpt = connectionsById.get(remoteConnectionManagerId) - if (!sendingConnectionOpt.isDefined) { - logError(s"Corresponding SendingConnection to ${remoteConnectionManagerId} not found") - return - } - - val sendingConnection = sendingConnectionOpt.get - connectionsById -= remoteConnectionManagerId - sendingConnection.close() - - val sendingConnectionManagerId = sendingConnection.getRemoteConnectionManagerId() - - assert(sendingConnectionManagerId == remoteConnectionManagerId) - - messageStatuses.synchronized { - for (s <- messageStatuses.values - if s.connectionManagerId == sendingConnectionManagerId) { - logInfo("Notifying " + s) - s.failWithoutAck() - } - - messageStatuses.retain((i, status) => { - status.connectionManagerId != sendingConnectionManagerId - }) - } - case _ => logError("Unsupported type of connection.") - } - } finally { - // So that the selection keys can be removed. - wakeupSelector() - } - } - - def handleConnectionError(connection: Connection, e: Throwable) { - logInfo("Handling connection error on connection to " + - connection.getRemoteConnectionManagerId()) - removeConnection(connection) - } - - def changeConnectionKeyInterest(connection: Connection, ops: Int) { - keyInterestChangeRequests += ((connection.key, ops)) - // so that registrations happen ! - wakeupSelector() - } - - def receiveMessage(connection: Connection, message: Message) { - val connectionManagerId = ConnectionManagerId.fromSocketAddress(message.senderAddress) - logDebug("Received [" + message + "] from [" + connectionManagerId + "]") - val runnable = new Runnable() { - val creationTime = System.currentTimeMillis - def run() { - try { - logDebug("Handler thread delay is " + (System.currentTimeMillis - creationTime) + " ms") - handleMessage(connectionManagerId, message, connection) - logDebug("Handling delay is " + (System.currentTimeMillis - creationTime) + " ms") - } catch { - case NonFatal(e) => { - logError("Error when handling messages from " + - connection.getRemoteConnectionManagerId(), e) - connection.callOnExceptionCallbacks(e) - } - } - } - } - handleMessageExecutor.execute(runnable) - /* handleMessage(connection, message) */ - } - - private def handleClientAuthentication( - waitingConn: SendingConnection, - securityMsg: SecurityMessage, - connectionId : ConnectionId) { - if (waitingConn.isSaslComplete()) { - logDebug("Client sasl completed for id: " + waitingConn.connectionId) - connectionsAwaitingSasl -= waitingConn.connectionId - waitingConn.registerAfterAuth() - wakeupSelector() - return - } else { - var replyToken : Array[Byte] = null - try { - replyToken = waitingConn.sparkSaslClient.response(securityMsg.getToken) - if (waitingConn.isSaslComplete()) { - logDebug("Client sasl completed after evaluate for id: " + waitingConn.connectionId) - connectionsAwaitingSasl -= waitingConn.connectionId - waitingConn.registerAfterAuth() - wakeupSelector() - return - } - val securityMsgResp = SecurityMessage.fromResponse(replyToken, - securityMsg.getConnectionId.toString) - val message = securityMsgResp.toBufferMessage - if (message == null) throw new IOException("Error creating security message") - sendSecurityMessage(waitingConn.getRemoteConnectionManagerId(), message) - } catch { - case e: Exception => - logError("Error handling sasl client authentication", e) - waitingConn.close() - throw new IOException("Error evaluating sasl response: ", e) - } - } - } - - private def handleServerAuthentication( - 
connection: Connection, - securityMsg: SecurityMessage, - connectionId: ConnectionId) { - if (!connection.isSaslComplete()) { - logDebug("saslContext not established") - var replyToken : Array[Byte] = null - try { - connection.synchronized { - if (connection.sparkSaslServer == null) { - logDebug("Creating sasl Server") - connection.sparkSaslServer = new SparkSaslServer(conf.getAppId, securityManager, false) - } - } - replyToken = connection.sparkSaslServer.response(securityMsg.getToken) - if (connection.isSaslComplete()) { - logDebug("Server sasl completed: " + connection.connectionId + - " for: " + connectionId) - } else { - logDebug("Server sasl not completed: " + connection.connectionId + - " for: " + connectionId) - } - if (replyToken != null) { - val securityMsgResp = SecurityMessage.fromResponse(replyToken, - securityMsg.getConnectionId) - val message = securityMsgResp.toBufferMessage - if (message == null) throw new Exception("Error creating security Message") - sendSecurityMessage(connection.getRemoteConnectionManagerId(), message) - } - } catch { - case e: Exception => { - logError("Error in server auth negotiation: " + e) - // It would probably be better to send an error message telling other side auth failed - // but for now just close - connection.close() - } - } - } else { - logDebug("connection already established for this connection id: " + connection.connectionId) - } - } - - - private def handleAuthentication(conn: Connection, bufferMessage: BufferMessage): Boolean = { - if (bufferMessage.isSecurityNeg) { - logDebug("This is security neg message") - - // parse as SecurityMessage - val securityMsg = SecurityMessage.fromBufferMessage(bufferMessage) - val connectionId = ConnectionId.createConnectionIdFromString(securityMsg.getConnectionId) - - connectionsAwaitingSasl.get(connectionId) match { - case Some(waitingConn) => { - // Client - this must be in response to us doing Send - logDebug("Client handleAuth for id: " + waitingConn.connectionId) - handleClientAuthentication(waitingConn, securityMsg, connectionId) - } - case None => { - // Server - someone sent us something and we haven't authenticated yet - logDebug("Server handleAuth for id: " + connectionId) - handleServerAuthentication(conn, securityMsg, connectionId) - } - } - return true - } else { - if (!conn.isSaslComplete()) { - // We could handle this better and tell the client we need to do authentication - // negotiation, but for now just ignore them. - logError("message sent that is not security negotiation message on connection " + - "not authenticated yet, ignoring it!!") - return true - } - } - false - } - - private def handleMessage( - connectionManagerId: ConnectionManagerId, - message: Message, - connection: Connection) { - logDebug("Handling [" + message + "] from [" + connectionManagerId + "]") - message match { - case bufferMessage: BufferMessage => { - if (authEnabled) { - val res = handleAuthentication(connection, bufferMessage) - if (res) { - // message was security negotiation so skip the rest - logDebug("After handleAuth result was true, returning") - return - } - } - if (bufferMessage.hasAckId()) { - messageStatuses.synchronized { - messageStatuses.get(bufferMessage.ackId) match { - case Some(status) => { - messageStatuses -= bufferMessage.ackId - status.success(message) - } - case None => { - /** - * We can fall down on this code because of following 2 cases - * - * (1) Invalid ack sent due to buggy code. 
- * - * (2) Late-arriving ack for a SendMessageStatus - * To avoid unwilling late-arriving ack - * caused by long pause like GC, you can set - * larger value than default to spark.core.connection.ack.wait.timeout - */ - logWarning(s"Could not find reference for received ack Message ${message.id}") - } - } - } - } else { - var ackMessage : Option[Message] = None - try { - ackMessage = if (onReceiveCallback != null) { - logDebug("Calling back") - onReceiveCallback(bufferMessage, connectionManagerId) - } else { - logDebug("Not calling back as callback is null") - None - } - - if (ackMessage.isDefined) { - if (!ackMessage.get.isInstanceOf[BufferMessage]) { - logDebug("Response to " + bufferMessage + " is not a buffer message, it is of type " - + ackMessage.get.getClass) - } else if (!ackMessage.get.asInstanceOf[BufferMessage].hasAckId) { - logDebug("Response to " + bufferMessage + " does not have ack id set") - ackMessage.get.asInstanceOf[BufferMessage].ackId = bufferMessage.id - } - } - } catch { - case e: Exception => { - logError(s"Exception was thrown while processing message", e) - ackMessage = Some(Message.createErrorMessage(e, bufferMessage.id)) - } - } finally { - sendMessage(connectionManagerId, ackMessage.getOrElse { - Message.createBufferMessage(bufferMessage.id) - }) - } - } - } - case _ => throw new Exception("Unknown type message received") - } - } - - private def checkSendAuthFirst(connManagerId: ConnectionManagerId, conn: SendingConnection) { - // see if we need to do sasl before writing - // this should only be the first negotiation as the Client!!! - if (!conn.isSaslComplete()) { - conn.synchronized { - if (conn.sparkSaslClient == null) { - conn.sparkSaslClient = new SparkSaslClient(conf.getAppId, securityManager, false) - var firstResponse: Array[Byte] = null - try { - firstResponse = conn.sparkSaslClient.firstToken() - val securityMsg = SecurityMessage.fromResponse(firstResponse, - conn.connectionId.toString()) - val message = securityMsg.toBufferMessage - if (message == null) throw new Exception("Error creating security message") - connectionsAwaitingSasl += ((conn.connectionId, conn)) - sendSecurityMessage(connManagerId, message) - logDebug("adding connectionsAwaitingSasl id: " + conn.connectionId + - " to: " + connManagerId) - } catch { - case e: Exception => { - logError("Error getting first response from the SaslClient.", e) - conn.close() - throw new Exception("Error getting first response from the SaslClient") - } - } - } - } - } else { - logDebug("Sasl already established ") - } - } - - // allow us to add messages to the inbox for doing sasl negotiating - private def sendSecurityMessage(connManagerId: ConnectionManagerId, message: Message) { - def startNewConnection(): SendingConnection = { - val inetSocketAddress = new InetSocketAddress(connManagerId.host, connManagerId.port) - val newConnectionId = new ConnectionId(id, idCount.getAndIncrement.intValue) - val newConnection = new SendingConnection(inetSocketAddress, selector, connManagerId, - newConnectionId, securityManager) - logInfo("creating new sending connection for security! " + newConnectionId ) - registerRequests.enqueue(newConnection) - - newConnection - } - // I removed the lookupKey stuff as part of merge ... should I re-add it ? - // We did not find it useful in our test-env ... - // If we do re-add it, we should consistently use it everywhere I guess ? 
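Taken together, checkSendAuthFirst and handleClientAuthentication above implement the client half of the SASL handshake that these security messages carry. Collapsed into a synchronous loop for clarity (the real code is asynchronous and message-driven), the exchange looks roughly like the sketch below; `exchange` is a hypothetical helper standing in for "send a SecurityMessage and wait for the peer's reply token", and only methods visible in this patch are used.

  // Sketch only: client-side SASL negotiation, condensed from the code above.
  def clientNegotiate(conn: SendingConnection, exchange: Array[Byte] => Array[Byte]): Unit = {
    var token = conn.sparkSaslClient.firstToken()    // sent before any data message
    while (!conn.isSaslComplete()) {
      val challenge = exchange(token)                // peer replies with a SecurityMessage token
      token = conn.sparkSaslClient.response(challenge)
    }
    conn.registerAfterAuth()                         // only now is the connection usable for data
  }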
- message.senderAddress = id.toSocketAddress() - logTrace("Sending Security [" + message + "] to [" + connManagerId + "]") - val connection = connectionsById.getOrElseUpdate(connManagerId, startNewConnection()) - - // send security message until going connection has been authenticated - connection.send(message) - - wakeupSelector() - } - - private def sendMessage(connectionManagerId: ConnectionManagerId, message: Message) { - def startNewConnection(): SendingConnection = { - val inetSocketAddress = new InetSocketAddress(connectionManagerId.host, - connectionManagerId.port) - val newConnectionId = new ConnectionId(id, idCount.getAndIncrement.intValue) - val newConnection = new SendingConnection(inetSocketAddress, selector, connectionManagerId, - newConnectionId, securityManager) - newConnection.onException { - case (conn, e) => { - logError("Exception while sending message.", e) - reportSendingMessageFailure(message.id, e) - } - } - logTrace("creating new sending connection: " + newConnectionId) - registerRequests.enqueue(newConnection) - - newConnection - } - val connection = connectionsById.getOrElseUpdate(connectionManagerId, startNewConnection()) - - message.senderAddress = id.toSocketAddress() - logDebug("Before Sending [" + message + "] to [" + connectionManagerId + "]" + " " + - "connectionid: " + connection.connectionId) - - if (authEnabled) { - try { - checkSendAuthFirst(connectionManagerId, connection) - } catch { - case NonFatal(e) => { - reportSendingMessageFailure(message.id, e) - } - } - } - logDebug("Sending [" + message + "] to [" + connectionManagerId + "]") - connection.send(message) - wakeupSelector() - } - - private def reportSendingMessageFailure(messageId: Int, e: Throwable): Unit = { - // need to tell sender it failed - messageStatuses.synchronized { - val s = messageStatuses.get(messageId) - s match { - case Some(msgStatus) => { - messageStatuses -= messageId - logInfo("Notifying " + msgStatus.connectionManagerId) - msgStatus.failure(e) - } - case None => { - logError("no messageStatus for failed message id: " + messageId) - } - } - } - } - - private def wakeupSelector() { - selector.wakeup() - } - - /** - * Send a message and block until an acknowledgment is received or an error occurs. - * @param connectionManagerId the message's destination - * @param message the message being sent - * @return a Future that either returns the acknowledgment message or captures an exception. - */ - def sendMessageReliably(connectionManagerId: ConnectionManagerId, message: Message) - : Future[Message] = { - val promise = Promise[Message]() - - // It's important that the TimerTask doesn't capture a reference to `message`, which can cause - // memory leaks since cancelled TimerTasks won't necessarily be garbage collected until the time - // at which they would originally be scheduled to run. Therefore, extract the message id - // from outside of the TimerTask closure (see SPARK-4393 for more context). 
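sendMessageReliably returns a Future[Message] that completes with the acknowledgment or fails on error or timeout. A hypothetical caller consumes it roughly as sketched here; `remoteId` and `buffer` are placeholders, and `futureExecContext` is the manager's execution context used elsewhere in this patch.

  val reply: Future[Message] =
    manager.sendMessageReliably(remoteId, Message.createBufferMessage(buffer))
  reply.onComplete {
    case scala.util.Success(ack) => // acknowledgment (possibly an error message) received
    case scala.util.Failure(e)   => // never sent, never ACK'd, or timed out: retry or report
  }(manager.futureExecContext)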
- val messageId = message.id - // Keep a weak reference to the promise so that the completed promise may be garbage-collected - val promiseReference = new WeakReference(promise) - val timeoutTask: TimerTask = new TimerTask { - override def run(timeout: Timeout): Unit = { - messageStatuses.synchronized { - messageStatuses.remove(messageId).foreach { s => - val e = new IOException("sendMessageReliably failed because ack " + - s"was not received within $ackTimeout sec") - val p = promiseReference.get - if (p != null) { - // Attempt to fail the promise with a Timeout exception - if (!p.tryFailure(e)) { - // If we reach here, then someone else has already signalled success or failure - // on this promise, so log a warning: - logError("Ignore error because promise is completed", e) - } - } else { - // The WeakReference was empty, which should never happen because - // sendMessageReliably's caller should have a strong reference to promise.future; - logError("Promise was garbage collected; this should never happen!", e) - } - } - } - } - } - - val timeoutTaskHandle = ackTimeoutMonitor.newTimeout(timeoutTask, ackTimeout, TimeUnit.SECONDS) - - val status = new MessageStatus(message, connectionManagerId, s => { - timeoutTaskHandle.cancel() - s match { - case scala.util.Failure(e) => - // Indicates a failure where we either never sent or never got ACK'd - if (!promise.tryFailure(e)) { - logWarning("Ignore error because promise is completed", e) - } - case scala.util.Success(ackMessage) => - if (ackMessage.hasError) { - val errorMsgByteBuf = ackMessage.asInstanceOf[BufferMessage].buffers.head - val errorMsgBytes = new Array[Byte](errorMsgByteBuf.limit()) - errorMsgByteBuf.get(errorMsgBytes) - val errorMsg = new String(errorMsgBytes, UTF_8) - val e = new IOException( - s"sendMessageReliably failed with ACK that signalled a remote error: $errorMsg") - if (!promise.tryFailure(e)) { - logWarning("Ignore error because promise is completed", e) - } - } else { - if (!promise.trySuccess(ackMessage)) { - logWarning("Drop ackMessage because promise is completed") - } - } - } - }) - messageStatuses.synchronized { - messageStatuses += ((message.id, status)) - } - - sendMessage(connectionManagerId, message) - promise.future - } - - def onReceiveMessage(callback: (Message, ConnectionManagerId) => Option[Message]) { - onReceiveCallback = callback - } - - def stop() { - isActive = false - ackTimeoutMonitor.stop() - selector.close() - selectorThread.interrupt() - selectorThread.join() - val connections = connectionsByKey.values - connections.foreach(_.close()) - if (connectionsByKey.size != 0) { - logWarning("All connections not cleaned up") - } - handleMessageExecutor.shutdown() - handleReadWriteExecutor.shutdown() - handleConnectExecutor.shutdown() - logInfo("ConnectionManager stopped") - } -} - - -private[spark] object ConnectionManager { - import scala.concurrent.ExecutionContext.Implicits.global - - def main(args: Array[String]) { - val conf = new SparkConf - val manager = new ConnectionManager(9999, conf, new SecurityManager(conf)) - manager.onReceiveMessage((msg: Message, id: ConnectionManagerId) => { - // scalastyle:off println - println("Received [" + msg + "] from [" + id + "]") - // scalastyle:on println - None - }) - - /* testSequentialSending(manager) */ - /* System.gc() */ - - /* testParallelSending(manager) */ - /* System.gc() */ - - /* testParallelDecreasingSending(manager) */ - /* System.gc() */ - - testContinuousSending(manager) - System.gc() - } - - // scalastyle:off println - def 
testSequentialSending(manager: ConnectionManager) { - println("--------------------------") - println("Sequential Sending") - println("--------------------------") - val size = 10 * 1024 * 1024 - val count = 10 - - val buffer = ByteBuffer.allocate(size).put(Array.tabulate[Byte](size)(x => x.toByte)) - buffer.flip - - (0 until count).map(i => { - val bufferMessage = Message.createBufferMessage(buffer.duplicate) - Await.result(manager.sendMessageReliably(manager.id, bufferMessage), Duration.Inf) - }) - println("--------------------------") - println() - } - - def testParallelSending(manager: ConnectionManager) { - println("--------------------------") - println("Parallel Sending") - println("--------------------------") - val size = 10 * 1024 * 1024 - val count = 10 - - val buffer = ByteBuffer.allocate(size).put(Array.tabulate[Byte](size)(x => x.toByte)) - buffer.flip - - val startTime = System.currentTimeMillis - (0 until count).map(i => { - val bufferMessage = Message.createBufferMessage(buffer.duplicate) - manager.sendMessageReliably(manager.id, bufferMessage) - }).foreach(f => { - f.onFailure { - case e => println("Failed due to " + e) - } - Await.ready(f, 1 second) - }) - val finishTime = System.currentTimeMillis - - val mb = size * count / 1024.0 / 1024.0 - val ms = finishTime - startTime - val tput = mb * 1000.0 / ms - println("--------------------------") - println("Started at " + startTime + ", finished at " + finishTime) - println("Sent " + count + " messages of size " + size + " in " + ms + " ms " + - "(" + tput + " MB/s)") - println("--------------------------") - println() - } - - def testParallelDecreasingSending(manager: ConnectionManager) { - println("--------------------------") - println("Parallel Decreasing Sending") - println("--------------------------") - val size = 10 * 1024 * 1024 - val count = 10 - val buffers = Array.tabulate(count) { i => - val bufferLen = size * (i + 1) - val bufferContent = Array.tabulate[Byte](bufferLen)(x => x.toByte) - ByteBuffer.allocate(bufferLen).put(bufferContent) - } - buffers.foreach(_.flip) - val mb = buffers.map(_.remaining).reduceLeft(_ + _) / 1024.0 / 1024.0 - - val startTime = System.currentTimeMillis - (0 until count).map(i => { - val bufferMessage = Message.createBufferMessage(buffers(count - 1 - i).duplicate) - manager.sendMessageReliably(manager.id, bufferMessage) - }).foreach(f => { - f.onFailure { - case e => println("Failed due to " + e) - } - Await.ready(f, 1 second) - }) - val finishTime = System.currentTimeMillis - - val ms = finishTime - startTime - val tput = mb * 1000.0 / ms - println("--------------------------") - /* println("Started at " + startTime + ", finished at " + finishTime) */ - println("Sent " + mb + " MB in " + ms + " ms (" + tput + " MB/s)") - println("--------------------------") - println() - } - - def testContinuousSending(manager: ConnectionManager) { - println("--------------------------") - println("Continuous Sending") - println("--------------------------") - val size = 10 * 1024 * 1024 - val count = 10 - - val buffer = ByteBuffer.allocate(size).put(Array.tabulate[Byte](size)(x => x.toByte)) - buffer.flip - - val startTime = System.currentTimeMillis - while(true) { - (0 until count).map(i => { - val bufferMessage = Message.createBufferMessage(buffer.duplicate) - manager.sendMessageReliably(manager.id, bufferMessage) - }).foreach(f => { - f.onFailure { - case e => println("Failed due to " + e) - } - Await.ready(f, 1 second) - }) - val finishTime = System.currentTimeMillis - Thread.sleep(1000) - val 
mb = size * count / 1024.0 / 1024.0 - val ms = finishTime - startTime - val tput = mb * 1000.0 / ms - println("Sent " + mb + " MB in " + ms + " ms (" + tput + " MB/s)") - println("--------------------------") - println() - } - } - // scalastyle:on println -} diff --git a/core/src/main/scala/org/apache/spark/network/nio/Message.scala b/core/src/main/scala/org/apache/spark/network/nio/Message.scala deleted file mode 100644 index 85d2fe2bf9c20..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/Message.scala +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.network.nio - -import java.net.InetSocketAddress -import java.nio.ByteBuffer - -import scala.collection.mutable.ArrayBuffer - -import com.google.common.base.Charsets.UTF_8 - -import org.apache.spark.util.Utils - -private[nio] abstract class Message(val typ: Long, val id: Int) { - var senderAddress: InetSocketAddress = null - var started = false - var startTime = -1L - var finishTime = -1L - var isSecurityNeg = false - var hasError = false - - def size: Int - - def getChunkForSending(maxChunkSize: Int): Option[MessageChunk] - - def getChunkForReceiving(chunkSize: Int): Option[MessageChunk] - - def timeTaken(): String = (finishTime - startTime).toString + " ms" - - override def toString: String = { - this.getClass.getSimpleName + "(id = " + id + ", size = " + size + ")" - } -} - - -private[nio] object Message { - val BUFFER_MESSAGE = 1111111111L - - var lastId = 1 - - def getNewId(): Int = synchronized { - lastId += 1 - if (lastId == 0) { - lastId += 1 - } - lastId - } - - def createBufferMessage(dataBuffers: Seq[ByteBuffer], ackId: Int): BufferMessage = { - if (dataBuffers == null) { - return new BufferMessage(getNewId(), new ArrayBuffer[ByteBuffer], ackId) - } - if (dataBuffers.exists(_ == null)) { - throw new Exception("Attempting to create buffer message with null buffer") - } - new BufferMessage(getNewId(), new ArrayBuffer[ByteBuffer] ++= dataBuffers, ackId) - } - - def createBufferMessage(dataBuffers: Seq[ByteBuffer]): BufferMessage = - createBufferMessage(dataBuffers, 0) - - def createBufferMessage(dataBuffer: ByteBuffer, ackId: Int): BufferMessage = { - if (dataBuffer == null) { - createBufferMessage(Array(ByteBuffer.allocate(0)), ackId) - } else { - createBufferMessage(Array(dataBuffer), ackId) - } - } - - def createBufferMessage(dataBuffer: ByteBuffer): BufferMessage = - createBufferMessage(dataBuffer, 0) - - def createBufferMessage(ackId: Int): BufferMessage = { - createBufferMessage(new Array[ByteBuffer](0), ackId) - } - - /** - * Create a "negative acknowledgment" to notify a sender that an error occurred - * while processing its message. 
The exception's stacktrace will be formatted - * as a string, serialized into a byte array, and sent as the message payload. - */ - def createErrorMessage(exception: Exception, ackId: Int): BufferMessage = { - val exceptionString = Utils.exceptionString(exception) - val serializedExceptionString = ByteBuffer.wrap(exceptionString.getBytes(UTF_8)) - val errorMessage = createBufferMessage(serializedExceptionString, ackId) - errorMessage.hasError = true - errorMessage - } - - def create(header: MessageChunkHeader): Message = { - val newMessage: Message = header.typ match { - case BUFFER_MESSAGE => new BufferMessage(header.id, - ArrayBuffer(ByteBuffer.allocate(header.totalSize)), header.other) - } - newMessage.hasError = header.hasError - newMessage.senderAddress = header.address - newMessage - } -} diff --git a/core/src/main/scala/org/apache/spark/network/nio/MessageChunkHeader.scala b/core/src/main/scala/org/apache/spark/network/nio/MessageChunkHeader.scala deleted file mode 100644 index 7b3da4bb9d5ee..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/MessageChunkHeader.scala +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.network.nio - -import java.net.{InetAddress, InetSocketAddress} -import java.nio.ByteBuffer - -private[nio] class MessageChunkHeader( - val typ: Long, - val id: Int, - val totalSize: Int, - val chunkSize: Int, - val other: Int, - val hasError: Boolean, - val securityNeg: Int, - val address: InetSocketAddress) { - lazy val buffer = { - // No need to change this, at 'use' time, we do a reverse lookup of the hostname. - // Refer to network.Connection - val ip = address.getAddress.getAddress() - val port = address.getPort() - ByteBuffer. - allocate(MessageChunkHeader.HEADER_SIZE). - putLong(typ). - putInt(id). - putInt(totalSize). - putInt(chunkSize). - putInt(other). - put(if (hasError) 1.asInstanceOf[Byte] else 0.asInstanceOf[Byte]). - putInt(securityNeg). - putInt(ip.size). - put(ip). - putInt(port). - position(MessageChunkHeader.HEADER_SIZE). 
- flip.asInstanceOf[ByteBuffer] - } - - override def toString: String = { - "" + this.getClass.getSimpleName + ":" + id + " of type " + typ + - " and sizes " + totalSize + " / " + chunkSize + " bytes, securityNeg: " + securityNeg - } - -} - - -private[nio] object MessageChunkHeader { - val HEADER_SIZE = 45 - - def create(buffer: ByteBuffer): MessageChunkHeader = { - if (buffer.remaining != HEADER_SIZE) { - throw new IllegalArgumentException("Cannot convert buffer data to Message") - } - val typ = buffer.getLong() - val id = buffer.getInt() - val totalSize = buffer.getInt() - val chunkSize = buffer.getInt() - val other = buffer.getInt() - val hasError = buffer.get() != 0 - val securityNeg = buffer.getInt() - val ipSize = buffer.getInt() - val ipBytes = new Array[Byte](ipSize) - buffer.get(ipBytes) - val ip = InetAddress.getByAddress(ipBytes) - val port = buffer.getInt() - new MessageChunkHeader(typ, id, totalSize, chunkSize, other, hasError, securityNeg, - new InetSocketAddress(ip, port)) - } -} diff --git a/core/src/main/scala/org/apache/spark/network/nio/NioBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/nio/NioBlockTransferService.scala deleted file mode 100644 index b2aec160635c7..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/NioBlockTransferService.scala +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.network.nio - -import java.nio.ByteBuffer - -import org.apache.spark.network._ -import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} -import org.apache.spark.network.shuffle.BlockFetchingListener -import org.apache.spark.storage.{BlockId, StorageLevel} -import org.apache.spark.util.Utils -import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException} - -import scala.concurrent.Future - - -/** - * A [[BlockTransferService]] implementation based on [[ConnectionManager]], a custom - * implementation using Java NIO. - */ -final class NioBlockTransferService(conf: SparkConf, securityManager: SecurityManager) - extends BlockTransferService with Logging { - - private var cm: ConnectionManager = _ - - private var blockDataManager: BlockDataManager = _ - - /** - * Port number the service is listening on, available only after [[init]] is invoked. - */ - override def port: Int = { - checkInit() - cm.id.port - } - - /** - * Host name the service is listening on, available only after [[init]] is invoked. - */ - override def hostName: String = { - checkInit() - cm.id.host - } - - /** - * Initialize the transfer service by giving it the BlockDataManager that can be used to fetch - * local blocks or put local blocks. 
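MessageChunkHeader above round-trips through a fixed-size ByteBuffer: fields are written in a fixed order, the buffer is flipped for reading, and the receiving side rejects anything that is not exactly HEADER_SIZE bytes. Reduced to a couple of fields (not the real 45-byte layout), the pattern is:

  import java.nio.ByteBuffer

  case class MiniHeader(typ: Long, id: Int, totalSize: Int)
  val MiniHeaderSize = 8 + 4 + 4   // one long + two ints

  def encode(h: MiniHeader): ByteBuffer =
    ByteBuffer.allocate(MiniHeaderSize)
      .putLong(h.typ)
      .putInt(h.id)
      .putInt(h.totalSize)
      .flip().asInstanceOf[ByteBuffer]   // same cast the original header uses

  def decode(buf: ByteBuffer): MiniHeader = {
    require(buf.remaining == MiniHeaderSize, "cannot convert buffer data to header")
    MiniHeader(buf.getLong(), buf.getInt(), buf.getInt())
  }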
- */ - override def init(blockDataManager: BlockDataManager): Unit = { - this.blockDataManager = blockDataManager - cm = new ConnectionManager( - conf.getInt("spark.blockManager.port", 0), - conf, - securityManager, - "Connection manager for block manager") - cm.onReceiveMessage(onBlockMessageReceive) - } - - /** - * Tear down the transfer service. - */ - override def close(): Unit = { - if (cm != null) { - cm.stop() - } - } - - override def fetchBlocks( - host: String, - port: Int, - execId: String, - blockIds: Array[String], - listener: BlockFetchingListener): Unit = { - checkInit() - - val cmId = new ConnectionManagerId(host, port) - val blockMessageArray = new BlockMessageArray(blockIds.map { blockId => - BlockMessage.fromGetBlock(GetBlock(BlockId(blockId))) - }) - - val future = cm.sendMessageReliably(cmId, blockMessageArray.toBufferMessage) - - // Register the listener on success/failure future callback. - future.onSuccess { case message => - val bufferMessage = message.asInstanceOf[BufferMessage] - val blockMessageArray = BlockMessageArray.fromBufferMessage(bufferMessage) - - // SPARK-4064: In some cases(eg. Remote block was removed) blockMessageArray may be empty. - if (blockMessageArray.isEmpty) { - blockIds.foreach { id => - listener.onBlockFetchFailure(id, new SparkException(s"Received empty message from $cmId")) - } - } else { - for (blockMessage: BlockMessage <- blockMessageArray) { - val msgType = blockMessage.getType - if (msgType != BlockMessage.TYPE_GOT_BLOCK) { - if (blockMessage.getId != null) { - listener.onBlockFetchFailure(blockMessage.getId.toString, - new SparkException(s"Unexpected message $msgType received from $cmId")) - } - } else { - val blockId = blockMessage.getId - val networkSize = blockMessage.getData.limit() - listener.onBlockFetchSuccess( - blockId.toString, new NioManagedBuffer(blockMessage.getData)) - } - } - } - }(cm.futureExecContext) - - future.onFailure { case exception => - blockIds.foreach { blockId => - listener.onBlockFetchFailure(blockId, exception) - } - }(cm.futureExecContext) - } - - /** - * Upload a single block to a remote node, available only after [[init]] is invoked. - * - * This call blocks until the upload completes, or throws an exception upon failures. 
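For context, a hypothetical caller drives this service by wiring it to a BlockDataManager and then fetching blocks through the asynchronous listener interface shown in fetchBlocks above. In the sketch below, `conf`, `secMgr`, and `bdm` are placeholders, and the host, port, and block id are illustrative.

  val service = new NioBlockTransferService(conf, secMgr)
  service.init(bdm)
  service.fetchBlocks("remote-host", 50010, "exec-1", Array("rdd_0_0"),
    new BlockFetchingListener {
      override def onBlockFetchSuccess(blockId: String, data: ManagedBuffer): Unit =
        println(s"fetched $blockId (${data.size} bytes)")
      override def onBlockFetchFailure(blockId: String, e: Throwable): Unit =
        println(s"failed to fetch $blockId: $e")
    })
  service.close()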
- */ - override def uploadBlock( - hostname: String, - port: Int, - execId: String, - blockId: BlockId, - blockData: ManagedBuffer, - level: StorageLevel) - : Future[Unit] = { - checkInit() - val msg = PutBlock(blockId, blockData.nioByteBuffer(), level) - val blockMessageArray = new BlockMessageArray(BlockMessage.fromPutBlock(msg)) - val remoteCmId = new ConnectionManagerId(hostName, port) - val reply = cm.sendMessageReliably(remoteCmId, blockMessageArray.toBufferMessage) - reply.map(x => ())(cm.futureExecContext) - } - - private def checkInit(): Unit = if (cm == null) { - throw new IllegalStateException(getClass.getName + " has not been initialized") - } - - private def onBlockMessageReceive(msg: Message, id: ConnectionManagerId): Option[Message] = { - logDebug("Handling message " + msg) - msg match { - case bufferMessage: BufferMessage => - try { - logDebug("Handling as a buffer message " + bufferMessage) - val blockMessages = BlockMessageArray.fromBufferMessage(bufferMessage) - logDebug("Parsed as a block message array") - val responseMessages = blockMessages.map(processBlockMessage).filter(_ != None).map(_.get) - Some(new BlockMessageArray(responseMessages).toBufferMessage) - } catch { - case e: Exception => - logError("Exception handling buffer message", e) - Some(Message.createErrorMessage(e, msg.id)) - } - - case otherMessage: Any => - val errorMsg = s"Received unknown message type: ${otherMessage.getClass.getName}" - logError(errorMsg) - Some(Message.createErrorMessage(new UnsupportedOperationException(errorMsg), msg.id)) - } - } - - private def processBlockMessage(blockMessage: BlockMessage): Option[BlockMessage] = { - blockMessage.getType match { - case BlockMessage.TYPE_PUT_BLOCK => - val msg = PutBlock(blockMessage.getId, blockMessage.getData, blockMessage.getLevel) - logDebug("Received [" + msg + "]") - putBlock(msg.id, msg.data, msg.level) - None - - case BlockMessage.TYPE_GET_BLOCK => - val msg = new GetBlock(blockMessage.getId) - logDebug("Received [" + msg + "]") - val buffer = getBlock(msg.id) - if (buffer == null) { - return None - } - Some(BlockMessage.fromGotBlock(GotBlock(msg.id, buffer))) - - case _ => None - } - } - - private def putBlock(blockId: BlockId, bytes: ByteBuffer, level: StorageLevel) { - val startTimeMs = System.currentTimeMillis() - logDebug("PutBlock " + blockId + " started from " + startTimeMs + " with data: " + bytes) - blockDataManager.putBlockData(blockId, new NioManagedBuffer(bytes), level) - logDebug("PutBlock " + blockId + " used " + Utils.getUsedTimeMs(startTimeMs) - + " with data size: " + bytes.limit) - } - - private def getBlock(blockId: BlockId): ByteBuffer = { - val startTimeMs = System.currentTimeMillis() - logDebug("GetBlock " + blockId + " started from " + startTimeMs) - val buffer = blockDataManager.getBlockData(blockId) - logDebug("GetBlock " + blockId + " used " + Utils.getUsedTimeMs(startTimeMs) - + " and got buffer " + buffer) - buffer.nioByteBuffer() - } -} diff --git a/core/src/main/scala/org/apache/spark/network/nio/SecurityMessage.scala b/core/src/main/scala/org/apache/spark/network/nio/SecurityMessage.scala deleted file mode 100644 index 232c552f9865d..0000000000000 --- a/core/src/main/scala/org/apache/spark/network/nio/SecurityMessage.scala +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.network.nio - -import java.nio.ByteBuffer - -import scala.collection.mutable.{ArrayBuffer, StringBuilder} - -import org.apache.spark._ - -/** - * SecurityMessage is class that contains the connectionId and sasl token - * used in SASL negotiation. SecurityMessage has routines for converting - * it to and from a BufferMessage so that it can be sent by the ConnectionManager - * and easily consumed by users when received. - * The api was modeled after BlockMessage. - * - * The connectionId is the connectionId of the client side. Since - * message passing is asynchronous and its possible for the server side (receiving) - * to get multiple different types of messages on the same connection the connectionId - * is used to know which connnection the security message is intended for. - * - * For instance, lets say we are node_0. We need to send data to node_1. The node_0 side - * is acting as a client and connecting to node_1. SASL negotiation has to occur - * between node_0 and node_1 before node_1 trusts node_0 so node_0 sends a security message. - * node_1 receives the message from node_0 but before it can process it and send a response, - * some thread on node_1 decides it needs to send data to node_0 so it connects to node_0 - * and sends a security message of its own to authenticate as a client. Now node_0 gets - * the message and it needs to decide if this message is in response to it being a client - * (from the first send) or if its just node_1 trying to connect to it to send data. This - * is where the connectionId field is used. node_0 can lookup the connectionId to see if - * it is in response to it being a client or if its in response to someone sending other data. - * - * The format of a SecurityMessage as its sent is: - * - Length of the ConnectionId - * - ConnectionId - * - Length of the token - * - Token - */ -private[nio] class SecurityMessage extends Logging { - - private var connectionId: String = null - private var token: Array[Byte] = null - - def set(byteArr: Array[Byte], newconnectionId: String) { - if (byteArr == null) { - token = new Array[Byte](0) - } else { - token = byteArr - } - connectionId = newconnectionId - } - - /** - * Read the given buffer and set the members of this class. 
- */ - def set(buffer: ByteBuffer) { - val idLength = buffer.getInt() - val idBuilder = new StringBuilder(idLength) - for (i <- 1 to idLength) { - idBuilder += buffer.getChar() - } - connectionId = idBuilder.toString() - - val tokenLength = buffer.getInt() - token = new Array[Byte](tokenLength) - if (tokenLength > 0) { - buffer.get(token, 0, tokenLength) - } - } - - def set(bufferMsg: BufferMessage) { - val buffer = bufferMsg.buffers.apply(0) - buffer.clear() - set(buffer) - } - - def getConnectionId: String = { - return connectionId - } - - def getToken: Array[Byte] = { - return token - } - - /** - * Create a BufferMessage that can be sent by the ConnectionManager containing - * the security information from this class. - * @return BufferMessage - */ - def toBufferMessage: BufferMessage = { - val buffers = new ArrayBuffer[ByteBuffer]() - - // 4 bytes for the length of the connectionId - // connectionId is of type char so multiple the length by 2 to get number of bytes - // 4 bytes for the length of token - // token is a byte buffer so just take the length - var buffer = ByteBuffer.allocate(4 + connectionId.length() * 2 + 4 + token.length) - buffer.putInt(connectionId.length()) - connectionId.foreach((x: Char) => buffer.putChar(x)) - buffer.putInt(token.length) - - if (token.length > 0) { - buffer.put(token) - } - buffer.flip() - buffers += buffer - - var message = Message.createBufferMessage(buffers) - logDebug("message total size is : " + message.size) - message.isSecurityNeg = true - return message - } - - override def toString: String = { - "SecurityMessage [connId= " + connectionId + ", Token = " + token + "]" - } -} - -private[nio] object SecurityMessage { - - /** - * Convert the given BufferMessage to a SecurityMessage by parsing the contents - * of the BufferMessage and populating the SecurityMessage fields. - * @param bufferMessage is a BufferMessage that was received - * @return new SecurityMessage - */ - def fromBufferMessage(bufferMessage: BufferMessage): SecurityMessage = { - val newSecurityMessage = new SecurityMessage() - newSecurityMessage.set(bufferMessage) - newSecurityMessage - } - - /** - * Create a SecurityMessage to send from a given saslResponse. 
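Condensed, the wire format documented above (a 4-byte id length, the id as 2-byte chars, a 4-byte token length, then the token bytes) round-trips like this sketch:

  import java.nio.ByteBuffer

  def encode(connectionId: String, token: Array[Byte]): ByteBuffer = {
    val buf = ByteBuffer.allocate(4 + connectionId.length * 2 + 4 + token.length)
    buf.putInt(connectionId.length)
    connectionId.foreach(c => buf.putChar(c))
    buf.putInt(token.length)
    buf.put(token)
    buf.flip()
    buf
  }

  def decode(buf: ByteBuffer): (String, Array[Byte]) = {
    val idLength = buf.getInt()
    val id = new String(Array.fill(idLength)(buf.getChar()))
    val token = new Array[Byte](buf.getInt())
    buf.get(token)
    (id, token)
  }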
- * @param response is the response to a challenge from the SaslClient or Saslserver - * @param connectionId the client connectionId we are negotiation authentication for - * @return a new SecurityMessage - */ - def fromResponse(response : Array[Byte], connectionId : String) : SecurityMessage = { - val newSecurityMessage = new SecurityMessage() - newSecurityMessage.set(response, connectionId) - newSecurityMessage - } -} diff --git a/core/src/main/scala/org/apache/spark/package.scala b/core/src/main/scala/org/apache/spark/package.scala index 8ae76c5f72f2e..7515aad09db73 100644 --- a/core/src/main/scala/org/apache/spark/package.scala +++ b/core/src/main/scala/org/apache/spark/package.scala @@ -43,5 +43,5 @@ package org.apache package object spark { // For package docs only - val SPARK_VERSION = "1.5.0-SNAPSHOT" + val SPARK_VERSION = "1.6.0-SNAPSHOT" } diff --git a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala index 1f755db485812..aedced7408cde 100644 --- a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala @@ -28,12 +28,13 @@ private[spark] class BinaryFileRDD[T]( inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], - @transient conf: Configuration, + conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance + val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) diff --git a/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala index 922030263756b..fc1710fbad0a3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala @@ -28,7 +28,7 @@ private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends P } private[spark] -class BlockRDD[T: ClassTag](@transient sc: SparkContext, @transient val blockIds: Array[BlockId]) +class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get) @@ -64,7 +64,7 @@ class BlockRDD[T: ClassTag](@transient sc: SparkContext, @transient val blockIds */ private[spark] def removeBlocks() { blockIds.foreach { blockId => - sc.env.blockManager.master.removeBlock(blockId) + sparkContext.env.blockManager.master.removeBlock(blockId) } _isValid = false } diff --git a/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala index c1d6971787572..18e8cddbc40db 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala @@ -27,8 +27,8 @@ import org.apache.spark.util.Utils private[spark] class CartesianPartition( idx: Int, - @transient rdd1: RDD[_], - @transient rdd2: RDD[_], + @transient private val rdd1: RDD[_], + @transient private val rdd2: RDD[_], s1Index: Int, s2Index: Int ) extends Partition { diff --git a/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala index 72fe215dae73e..b0364623af4cf 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala +++ 
b/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala @@ -29,7 +29,7 @@ private[spark] class CheckpointRDDPartition(val index: Int) extends Partition /** * An RDD that recovers checkpointed data from storage. */ -private[spark] abstract class CheckpointRDD[T: ClassTag](@transient sc: SparkContext) +private[spark] abstract class CheckpointRDD[T: ClassTag](sc: SparkContext) extends RDD[T](sc, Nil) { // CheckpointRDD should not be checkpointed again diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala index 9c617fc719cb5..935c3babd8ea1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala @@ -22,10 +22,11 @@ import scala.language.existentials import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.util.collection.{ExternalAppendOnlyMap, AppendOnlyMap, CompactBuffer} +import org.apache.spark.util.collection.{CompactBuffer, ExternalAppendOnlyMap} import org.apache.spark.util.Utils import org.apache.spark.serializer.Serializer @@ -74,7 +75,9 @@ private[spark] class CoGroupPartition( * @param part partitioner used to partition the shuffle output */ @DeveloperApi -class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part: Partitioner) +class CoGroupedRDD[K: ClassTag]( + @transient var rdds: Seq[RDD[_ <: Product2[K, _]]], + part: Partitioner) extends RDD[(K, Array[Iterable[_]])](rdds.head.context, Nil) { // For example, `(k, a) cogroup (k, b)` produces k -> Array(ArrayBuffer as, ArrayBuffer bs). @@ -125,8 +128,6 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part: override val partitioner: Some[Partitioner] = Some(part) override def compute(s: Partition, context: TaskContext): Iterator[(K, Array[Iterable[_]])] = { - val sparkConf = SparkEnv.get.conf - val externalSorting = sparkConf.getBoolean("spark.shuffle.spill", true) val split = s.asInstanceOf[CoGroupPartition] val numRdds = dependencies.length @@ -147,34 +148,16 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part: rddIterators += ((it, depNum)) } - if (!externalSorting) { - val map = new AppendOnlyMap[K, CoGroupCombiner] - val update: (Boolean, CoGroupCombiner) => CoGroupCombiner = (hadVal, oldVal) => { - if (hadVal) oldVal else Array.fill(numRdds)(new CoGroup) - } - val getCombiner: K => CoGroupCombiner = key => { - map.changeValue(key, update) - } - rddIterators.foreach { case (it, depNum) => - while (it.hasNext) { - val kv = it.next() - getCombiner(kv._1)(depNum) += kv._2 - } - } - new InterruptibleIterator(context, - map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]]) - } else { - val map = createExternalMap(numRdds) - for ((it, depNum) <- rddIterators) { - map.insertAll(it.map(pair => (pair._1, new CoGroupValue(pair._2, depNum)))) - } - context.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled) - context.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled) - context.internalMetricsToAccumulators( - InternalAccumulator.PEAK_EXECUTION_MEMORY).add(map.peakMemoryUsedBytes) - new InterruptibleIterator(context, - map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]]) + val map = createExternalMap(numRdds) + for ((it, depNum) <- rddIterators) { + map.insertAll(it.map(pair => (pair._1, new 
CoGroupValue(pair._2, depNum)))) } + context.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled) + context.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled) + context.internalMetricsToAccumulators( + InternalAccumulator.PEAK_EXECUTION_MEMORY).add(map.peakMemoryUsedBytes) + new InterruptibleIterator(context, + map.iterator.asInstanceOf[Iterator[(K, Array[Iterable[_]])]]) } private def createExternalMap(numRdds: Int) diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index e1f8719eead02..77b57132b9f1f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -51,7 +51,7 @@ import org.apache.spark.storage.StorageLevel /** * A Spark split class that wraps around a Hadoop InputSplit. */ -private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSplit) +private[spark] class HadoopPartition(rddId: Int, idx: Int, s: InputSplit) extends Partition { val inputSplit = new SerializableWritable[InputSplit](s) @@ -99,7 +99,7 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSp */ @DeveloperApi class HadoopRDD[K, V]( - @transient sc: SparkContext, + sc: SparkContext, broadcastedConf: Broadcast[SerializableConfiguration], initLocalJobConfFuncOpt: Option[JobConf => Unit], inputFormatClass: Class[_ <: InputFormat[K, V]], @@ -109,7 +109,7 @@ class HadoopRDD[K, V]( extends RDD[(K, V)](sc, Nil) with Logging { if (initLocalJobConfFuncOpt.isDefined) { - sc.clean(initLocalJobConfFuncOpt.get) + sparkContext.clean(initLocalJobConfFuncOpt.get) } def this( @@ -137,7 +137,7 @@ class HadoopRDD[K, V]( // used to build JobTracker ID private val createTime = new Date() - private val shouldCloneJobConf = sc.conf.getBoolean("spark.hadoop.cloneConf", false) + private val shouldCloneJobConf = sparkContext.conf.getBoolean("spark.hadoop.cloneConf", false) // Returns a JobConf that will be used on slaves to obtain input splits for Hadoop reads. protected def getJobConf(): JobConf = { @@ -182,17 +182,11 @@ class HadoopRDD[K, V]( } protected def getInputFormat(conf: JobConf): InputFormat[K, V] = { - if (HadoopRDD.containsCachedMetadata(inputFormatCacheKey)) { - return HadoopRDD.getCachedMetadata(inputFormatCacheKey).asInstanceOf[InputFormat[K, V]] - } - // Once an InputFormat for this RDD is created, cache it so that only one reflection call is - // done in each local process. 
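The deleted branch above relied on a process-local metadata cache (the HadoopRDD.getCachedMetadata lookup removed here, and its putCachedMetadata counterpart just below) so that the InputFormat was reflectively instantiated only once per executor JVM. A minimal sketch of that kind of cache, assuming a concurrent map keyed by the RDD's cache key (`cachedInputFormat` and `cacheKey` are hypothetical names):

  import scala.collection.concurrent.TrieMap

  // Process-local cache; entries survive across tasks in the same executor JVM.
  private val metadataCache = TrieMap.empty[String, Any]

  def cachedInputFormat[K, V](cacheKey: String, create: => InputFormat[K, V]): InputFormat[K, V] =
    metadataCache.getOrElseUpdate(cacheKey, create).asInstanceOf[InputFormat[K, V]]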
val newInputFormat = ReflectionUtils.newInstance(inputFormatClass.asInstanceOf[Class[_]], conf) .asInstanceOf[InputFormat[K, V]] if (newInputFormat.isInstanceOf[Configurable]) { newInputFormat.asInstanceOf[Configurable].setConf(conf) } - HadoopRDD.putCachedMetadata(inputFormatCacheKey, newInputFormat) newInputFormat } diff --git a/core/src/main/scala/org/apache/spark/rdd/LocalCheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/LocalCheckpointRDD.scala index daa5779d688cc..bfe19195fcd37 100644 --- a/core/src/main/scala/org/apache/spark/rdd/LocalCheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/LocalCheckpointRDD.scala @@ -35,7 +35,7 @@ import org.apache.spark.storage.RDDBlockId * @param numPartitions the number of partitions in the checkpointed RDD */ private[spark] class LocalCheckpointRDD[T: ClassTag]( - @transient sc: SparkContext, + sc: SparkContext, rddId: Int, numPartitions: Int) extends CheckpointRDD[T](sc) { diff --git a/core/src/main/scala/org/apache/spark/rdd/LocalRDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/LocalRDDCheckpointData.scala index d6fad896845f6..c115e0ff74d3c 100644 --- a/core/src/main/scala/org/apache/spark/rdd/LocalRDDCheckpointData.scala +++ b/core/src/main/scala/org/apache/spark/rdd/LocalRDDCheckpointData.scala @@ -31,7 +31,7 @@ import org.apache.spark.util.Utils * is written to the local, ephemeral block storage that lives in each executor. This is useful * for use cases where RDDs build up long lineages that need to be truncated often (e.g. GraphX). */ -private[spark] class LocalRDDCheckpointData[T: ClassTag](@transient rdd: RDD[T]) +private[spark] class LocalRDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T]) extends RDDCheckpointData[T](rdd) with Logging { /** diff --git a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala index b475bd8d79f85..417ff5278db2a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsWithPreparationRDD.scala @@ -17,6 +17,7 @@ package org.apache.spark.rdd +import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Partition, Partitioner, TaskContext} @@ -38,12 +39,28 @@ private[spark] class MapPartitionsWithPreparationRDD[U: ClassTag, T: ClassTag, M override def getPartitions: Array[Partition] = firstParent[T].partitions + // In certain join operations, prepare can be called on the same partition multiple times. + // In this case, we need to ensure that each call to compute gets a separate prepare argument. + private[this] val preparedArguments: ArrayBuffer[M] = new ArrayBuffer[M] + + /** + * Prepare a partition for a single call to compute. + */ + def prepare(): Unit = { + preparedArguments += preparePartition() + } + /** * Prepare a partition before computing it from its parent. 
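The preparedArguments queue and prepare() above, together with the compute() change just below, establish a small FIFO contract: prepare() may run more than once for the same partition (as happens in certain joins), and each compute() then consumes exactly one queued argument, falling back to preparePartition() when nothing was queued. A sketch of the intended call sequence, with `rdd`, `split`, and `context` as placeholders:

  rdd.prepare()                   // queues preparePartition() result #1
  rdd.prepare()                   // queues result #2
  rdd.compute(split, context)     // consumes result #1
  rdd.compute(split, context)     // consumes result #2
  rdd.compute(split, context)     // queue empty: calls preparePartition() directly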
*/ override def compute(partition: Partition, context: TaskContext): Iterator[U] = { - val preparedArgument = preparePartition() + val prepared = + if (preparedArguments.isEmpty) { + preparePartition() + } else { + preparedArguments.remove(0) + } val parentIterator = firstParent[T].iterator(partition, context) - executePartition(context, partition.index, preparedArgument, parentIterator) + executePartition(context, partition.index, prepared, parentIterator) } } diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 6a9c004d65cff..2872b93b8730e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -40,11 +40,10 @@ import org.apache.spark.storage.StorageLevel private[spark] class NewHadoopPartition( rddId: Int, val index: Int, - @transient rawSplit: InputSplit with Writable) + rawSplit: InputSplit with Writable) extends Partition { val serializableHadoopSplit = new SerializableWritable(rawSplit) - override def hashCode(): Int = 41 * (41 + rddId) + index } @@ -68,14 +67,14 @@ class NewHadoopRDD[K, V]( inputFormatClass: Class[_ <: InputFormat[K, V]], keyClass: Class[K], valueClass: Class[V], - @transient conf: Configuration) + @transient private val _conf: Configuration) extends RDD[(K, V)](sc, Nil) with SparkHadoopMapReduceUtil with Logging { // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it - private val confBroadcast = sc.broadcast(new SerializableConfiguration(conf)) - // private val serializableConf = new SerializableWritable(conf) + private val confBroadcast = sc.broadcast(new SerializableConfiguration(_conf)) + // private val serializableConf = new SerializableWritable(_conf) private val jobTrackerId: String = { val formatter = new SimpleDateFormat("yyyyMMddHHmm") @@ -84,14 +83,35 @@ class NewHadoopRDD[K, V]( @transient protected val jobId = new JobID(jobTrackerId, id) + private val shouldCloneJobConf = sparkContext.conf.getBoolean("spark.hadoop.cloneConf", false) + + def getConf: Configuration = { + val conf: Configuration = confBroadcast.value.value + if (shouldCloneJobConf) { + // Hadoop Configuration objects are not thread-safe, which may lead to various problems if + // one job modifies a configuration while another reads it (SPARK-2546, SPARK-10611). This + // problem occurs somewhat rarely because most jobs treat the configuration as though it's + // immutable. One solution, implemented here, is to clone the Configuration object. + // Unfortunately, this clone can be very expensive. To avoid unexpected performance + // regressions for workloads and Hadoop versions that do not suffer from these thread-safety + // issues, this cloning is disabled by default. 
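Stripped of the surrounding class, the guarded clone that getConf performs (shown just below) comes down to: clone only when spark.hadoop.cloneConf is set, and hold the global CONFIGURATION_INSTANTIATION_LOCK while doing so, because Configuration's copy constructor is not thread-safe (HADOOP-10456). A sketch, with `threadSafeConf` as a hypothetical helper name:

  def threadSafeConf(shared: Configuration, shouldClone: Boolean): Configuration =
    if (shouldClone) {
      NewHadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
        new Configuration(shared)   // defensive copy, taken under the global lock
      }
    } else {
      shared                        // default: skip the expensive copy entirely
    }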
+ NewHadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized { + logDebug("Cloning Hadoop Configuration") + new Configuration(conf) + } + } else { + conf + } + } + override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance inputFormat match { case configurable: Configurable => - configurable.setConf(conf) + configurable.setConf(_conf) case _ => } - val jobContext = newJobContext(conf, jobId) + val jobContext = newJobContext(_conf, jobId) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { @@ -104,7 +124,7 @@ class NewHadoopRDD[K, V]( val iter = new Iterator[(K, V)] { val split = theSplit.asInstanceOf[NewHadoopPartition] logInfo("Input split: " + split.serializableHadoopSplit) - val conf = confBroadcast.value.value + val conf = getConf val inputMetrics = context.taskMetrics .getInputMetricsForReadMethod(DataReadMethod.Hadoop) @@ -230,11 +250,15 @@ class NewHadoopRDD[K, V]( super.persist(storageLevel) } - - def getConf: Configuration = confBroadcast.value.value } private[spark] object NewHadoopRDD { + /** + * Configuration's constructor is not threadsafe (see SPARK-1097 and HADOOP-10456). + * Therefore, we synchronize on this lock before calling new Configuration(). + */ + val CONFIGURATION_INSTANTIATION_LOCK = new Object() + /** * Analogous to [[org.apache.spark.rdd.MapPartitionsRDD]], but passes in an InputSplit to * the given function rather than the index of the partition. @@ -262,12 +286,13 @@ private[spark] class WholeTextFileRDD( inputFormatClass: Class[_ <: WholeTextFileInputFormat], keyClass: Class[String], valueClass: Class[String], - @transient conf: Configuration, + conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, String](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance + val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 4e5f2e8a5d467..a981b63942e6d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -57,7 +57,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) with SparkHadoopMapReduceUtil with Serializable { + /** + * :: Experimental :: * Generic function to combine the elements for each key using a custom set of aggregation * functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined type" C * Note that V and C can be different -- for example, one might group an RDD of type @@ -70,12 +72,14 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * In addition, users can control the partitioning of the output RDD, and whether to perform * map-side aggregation (if a mapper can produce multiple items with the same key). 
*/ - def combineByKey[C](createCombiner: V => C, + @Experimental + def combineByKeyWithClassTag[C]( + createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true, - serializer: Serializer = null): RDD[(K, C)] = self.withScope { + serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope { require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0 if (keyClass.isArray) { if (mapSideCombine) { @@ -103,13 +107,50 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } /** - * Simplified version of combineByKey that hash-partitions the output RDD. + * Generic function to combine the elements for each key using a custom set of aggregation + * functions. This method is here for backward compatibility. It does not provide combiner + * classtag information to the shuffle. + * + * @see [[combineByKeyWithClassTag]] */ - def combineByKey[C](createCombiner: V => C, + def combineByKey[C]( + createCombiner: V => C, + mergeValue: (C, V) => C, + mergeCombiners: (C, C) => C, + partitioner: Partitioner, + mapSideCombine: Boolean = true, + serializer: Serializer = null): RDD[(K, C)] = self.withScope { + combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, + partitioner, mapSideCombine, serializer)(null) + } + + /** + * Simplified version of combineByKeyWithClassTag that hash-partitions the output RDD. + * This method is here for backward compatibility. It does not provide combiner + * classtag information to the shuffle. + * + * @see [[combineByKeyWithClassTag]] + */ + def combineByKey[C]( + createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, numPartitions: Int): RDD[(K, C)] = self.withScope { - combineByKey(createCombiner, mergeValue, mergeCombiners, new HashPartitioner(numPartitions)) + combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, numPartitions)(null) + } + + /** + * :: Experimental :: + * Simplified version of combineByKeyWithClassTag that hash-partitions the output RDD. + */ + @Experimental + def combineByKeyWithClassTag[C]( + createCombiner: V => C, + mergeValue: (C, V) => C, + mergeCombiners: (C, C) => C, + numPartitions: Int)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope { + combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, + new HashPartitioner(numPartitions)) } /** @@ -133,7 +174,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) // We will clean the combiner closure later in `combineByKey` val cleanedSeqOp = self.context.clean(seqOp) - combineByKey[U]((v: V) => cleanedSeqOp(createZero(), v), cleanedSeqOp, combOp, partitioner) + combineByKeyWithClassTag[U]((v: V) => cleanedSeqOp(createZero(), v), + cleanedSeqOp, combOp, partitioner) } /** @@ -182,7 +224,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val createZero = () => cachedSerializer.deserialize[V](ByteBuffer.wrap(zeroArray)) val cleanedFunc = self.context.clean(func) - combineByKey[V]((v: V) => cleanedFunc(createZero(), v), cleanedFunc, cleanedFunc, partitioner) + combineByKeyWithClassTag[V]((v: V) => cleanedFunc(createZero(), v), + cleanedFunc, cleanedFunc, partitioner) } /** @@ -268,7 +311,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * "combiner" in MapReduce. 
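A hypothetical use of the new combineByKeyWithClassTag, computing a per-key average; the explicit ClassTag for the combiner type (here (Double, Int)) is what the old combineByKey overloads cannot pass to the shuffle. `sc` is a placeholder SparkContext.

  val scores = sc.parallelize(Seq(("a", 1.0), ("a", 3.0), ("b", 2.0)))
  val avgByKey = scores
    .combineByKeyWithClassTag[(Double, Int)](
      (v: Double) => (v, 1),                                              // createCombiner
      (acc: (Double, Int), v: Double) => (acc._1 + v, acc._2 + 1),        // mergeValue
      (a: (Double, Int), b: (Double, Int)) => (a._1 + b._1, a._2 + b._2)) // mergeCombiners
    .mapValues { case (sum, count) => sum / count }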
*/ def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope { - combineByKey[V]((v: V) => v, func, func, partitioner) + combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner) } /** @@ -392,7 +435,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) h1 } - combineByKey(createHLL, mergeValueHLL, mergeHLL, partitioner).mapValues(_.cardinality()) + combineByKeyWithClassTag(createHLL, mergeValueHLL, mergeHLL, partitioner) + .mapValues(_.cardinality()) } /** @@ -466,7 +510,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val createCombiner = (v: V) => CompactBuffer(v) val mergeValue = (buf: CompactBuffer[V], v: V) => buf += v val mergeCombiners = (c1: CompactBuffer[V], c2: CompactBuffer[V]) => c1 ++= c2 - val bufs = combineByKey[CompactBuffer[V]]( + val bufs = combineByKeyWithClassTag[CompactBuffer[V]]( createCombiner, mergeValue, mergeCombiners, partitioner, mapSideCombine = false) bufs.asInstanceOf[RDD[(K, Iterable[V])]] } @@ -565,12 +609,30 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } /** - * Simplified version of combineByKey that hash-partitions the resulting RDD using the + * Simplified version of combineByKeyWithClassTag that hash-partitions the resulting RDD using the + * existing partitioner/parallelism level. This method is here for backward compatibility. It + * does not provide combiner classtag information to the shuffle. + * + * @see [[combineByKeyWithClassTag]] + */ + def combineByKey[C]( + createCombiner: V => C, + mergeValue: (C, V) => C, + mergeCombiners: (C, C) => C): RDD[(K, C)] = self.withScope { + combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners)(null) + } + + /** + * :: Experimental :: + * Simplified version of combineByKeyWithClassTag that hash-partitions the resulting RDD using the * existing partitioner/parallelism level. */ - def combineByKey[C](createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C) - : RDD[(K, C)] = self.withScope { - combineByKey(createCombiner, mergeValue, mergeCombiners, defaultPartitioner(self)) + @Experimental + def combineByKeyWithClassTag[C]( + createCombiner: V => C, + mergeValue: (C, V) => C, + mergeCombiners: (C, C) => C)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope { + combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, defaultPartitioner(self)) } /** @@ -934,8 +996,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) job.setOutputKeyClass(keyClass) job.setOutputValueClass(valueClass) job.setOutputFormatClass(outputFormatClass) - job.getConfiguration.set("mapred.output.dir", path) - saveAsNewAPIHadoopDataset(job.getConfiguration) + val jobConfiguration = SparkHadoopUtil.get.getConfigurationFromJobContext(job) + jobConfiguration.set("mapred.output.dir", path) + saveAsNewAPIHadoopDataset(jobConfiguration) } /** @@ -955,6 +1018,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) /** * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. + * + * Note that, we should make sure our tasks are idempotent when speculation is enabled, i.e. do + * not use output committer that writes data directly. + * There is an example in https://issues.apache.org/jira/browse/SPARK-10063 to show the bad + * result of using direct output committer with speculation enabled. 
*/ def saveAsHadoopFile( path: String, @@ -967,10 +1035,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val hadoopConf = conf hadoopConf.setOutputKeyClass(keyClass) hadoopConf.setOutputValueClass(valueClass) - // Doesn't work in Scala 2.9 due to what may be a generics bug - // TODO: Should we uncomment this for Scala 2.10? - // conf.setOutputFormat(outputFormatClass) - hadoopConf.set("mapred.output.format.class", outputFormatClass.getName) + conf.setOutputFormat(outputFormatClass) for (c <- codec) { hadoopConf.setCompressMapOutput(true) hadoopConf.set("mapred.output.compress", "true") @@ -984,6 +1049,19 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) hadoopConf.setOutputCommitter(classOf[FileOutputCommitter]) } + // When speculation is on and output committer class name contains "Direct", we should warn + // users that they may lose data if they are using a direct output committer. + val speculationEnabled = self.conf.getBoolean("spark.speculation", false) + val outputCommitterClass = hadoopConf.get("mapred.output.committer.class", "") + if (speculationEnabled && outputCommitterClass.contains("Direct")) { + val warningMessage = + s"$outputCommitterClass may be an output committer that writes data directly to " + + "the final location. Because speculation is enabled, this output committer may " + + "cause data loss (see the case in SPARK-10063). If possible, please use an output " + + "committer that does not have this behavior (e.g. FileOutputCommitter)." + logWarning(warningMessage) + } + FileOutputFormat.setOutputPath(hadoopConf, SparkHadoopWriter.createPathFromString(path, hadoopConf)) saveAsHadoopDataset(hadoopConf) @@ -994,6 +1072,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Configuration object for that storage system. The Conf should set an OutputFormat and any * output paths required (e.g. a table name to write to) in the same way as it would be * configured for a Hadoop MapReduce job. + * + * Note that we should make sure our tasks are idempotent when speculation is enabled, i.e. do + * not use an output committer that writes data directly. + * There is an example in https://issues.apache.org/jira/browse/SPARK-10063 to show the bad + * result of using a direct output committer with speculation enabled. */ def saveAsNewAPIHadoopDataset(conf: Configuration): Unit = self.withScope { // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038). val hadoopConf = conf @@ -1002,7 +1085,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val formatter = new SimpleDateFormat("yyyyMMddHHmm") val jobtrackerID = formatter.format(new Date()) val stageId = self.id - val wrappedConf = new SerializableConfiguration(job.getConfiguration) + val jobConfiguration = SparkHadoopUtil.get.getConfigurationFromJobContext(job) + val wrappedConf = new SerializableConfiguration(jobConfiguration) val outfmt = job.getOutputFormatClass val jobFormat = outfmt.newInstance @@ -1051,6 +1135,20 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val jobAttemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = true, 0, 0) val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId) val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext) + + // When speculation is on and output committer class name contains "Direct", we should warn + // users that they may lose data if they are using a direct output committer.
+ val speculationEnabled = self.conf.getBoolean("spark.speculation", false) + val outputCommitterClass = jobCommitter.getClass.getSimpleName + if (speculationEnabled && outputCommitterClass.contains("Direct")) { + val warningMessage = + s"$outputCommitterClass may be an output committer that writes data directly to " + + "the final location. Because speculation is enabled, this output committer may " + + "cause data loss (see the case in SPARK-10063). If possible, please use an output " + + "committer that does not have this behavior (e.g. FileOutputCommitter)." + logWarning(warningMessage) + } + jobCommitter.setupJob(jobTaskContext) self.context.runJob(self, writeShard) jobCommitter.commitJob(jobTaskContext) @@ -1065,7 +1163,6 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) def saveAsHadoopDataset(conf: JobConf): Unit = self.withScope { // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038). val hadoopConf = conf - val wrappedConf = new SerializableConfiguration(hadoopConf) val outputFormatInstance = hadoopConf.getOutputFormat val keyClass = hadoopConf.getOutputKeyClass val valueClass = hadoopConf.getOutputValueClass @@ -1093,7 +1190,6 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) writer.preSetup() val writeToFile = (context: TaskContext, iter: Iterator[(K, V)]) => { - val config = wrappedConf.value // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it // around by taking a mod. We expect that no task will be attempted 2 billion times. val taskAttemptId = (context.taskAttemptId % Int.MaxValue).toInt diff --git a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala index e2394e28f8d26..582fa93afe34e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala @@ -83,8 +83,8 @@ private[spark] class ParallelCollectionPartition[T: ClassTag]( } private[spark] class ParallelCollectionRDD[T: ClassTag]( - @transient sc: SparkContext, - @transient data: Seq[T], + sc: SparkContext, + @transient private val data: Seq[T], numSlices: Int, locationPrefs: Map[Int, Seq[String]]) extends RDD[T](sc, Nil) { diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala index a00f4c1cdff91..d6a37e8cc5dac 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala @@ -32,7 +32,7 @@ private[spark] class PartitionPruningRDDPartition(idx: Int, val parentSplit: Par * Represents a dependency between the PartitionPruningRDD and its parent. In this * case, the child RDD contains a subset of partitions of the parents'.
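A short usage sketch for the pruning RDD touched here, assuming an existing RDD named parent (PartitionPruningRDD.create is the existing @DeveloperApi factory; the predicate is illustrative):

    import org.apache.spark.rdd.{PartitionPruningRDD, RDD}

    // Keep only the first ten partitions of `parent`; no shuffle is involved because
    // PruneDependency is a NarrowDependency on the parent RDD.
    def firstTenPartitions[T](parent: RDD[T]): RDD[T] =
      PartitionPruningRDD.create(parent, partitionIndex => partitionIndex < 10)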
*/ -private[spark] class PruneDependency[T](rdd: RDD[T], @transient partitionFilterFunc: Int => Boolean) +private[spark] class PruneDependency[T](rdd: RDD[T], partitionFilterFunc: Int => Boolean) extends NarrowDependency[T](rdd) { @transient @@ -55,8 +55,8 @@ private[spark] class PruneDependency[T](rdd: RDD[T], @transient partitionFilterF */ @DeveloperApi class PartitionPruningRDD[T: ClassTag]( - @transient prev: RDD[T], - @transient partitionFilterFunc: Int => Boolean) + prev: RDD[T], + partitionFilterFunc: Int => Boolean) extends RDD[T](prev.context, List(new PruneDependency(prev, partitionFilterFunc))) { override def compute(split: Partition, context: TaskContext): Iterator[T] = { diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala index a637d6f15b7e5..3b1acacf409b9 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala @@ -47,8 +47,8 @@ class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], - @transient preservesPartitioning: Boolean, - @transient seed: Long = Utils.random.nextLong) + preservesPartitioning: Boolean, + @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 081c721f23687..a56e542242d5f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -469,50 +469,44 @@ abstract class RDD[T: ClassTag]( * @param seed seed for the random number generator * @return sample of specified size in an array */ - // TODO: rewrite this without return statements so we can wrap it in a scope def takeSample( withReplacement: Boolean, num: Int, - seed: Long = Utils.random.nextLong): Array[T] = { + seed: Long = Utils.random.nextLong): Array[T] = withScope { val numStDev = 10.0 - if (num < 0) { - throw new IllegalArgumentException("Negative number of elements requested") - } else if (num == 0) { - return new Array[T](0) - } - - val initialCount = this.count() - if (initialCount == 0) { - return new Array[T](0) - } - - val maxSampleSize = Int.MaxValue - (numStDev * math.sqrt(Int.MaxValue)).toInt - if (num > maxSampleSize) { - throw new IllegalArgumentException("Cannot support a sample size > Int.MaxValue - " + - s"$numStDev * math.sqrt(Int.MaxValue)") - } - - val rand = new Random(seed) - if (!withReplacement && num >= initialCount) { - return Utils.randomizeInPlace(this.collect(), rand) - } - - val fraction = SamplingUtils.computeFractionForSampleSize(num, initialCount, - withReplacement) - - var samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() + require(num >= 0, "Negative number of elements requested") + require(num <= (Int.MaxValue - (numStDev * math.sqrt(Int.MaxValue)).toInt), + "Cannot support a sample size > Int.MaxValue - " + + s"$numStDev * math.sqrt(Int.MaxValue)") - // If the first sample didn't turn out large enough, keep trying to take samples; - // this shouldn't happen often because we use a big multiplier for the initial size - var numIters = 0 - while (samples.length < num) { - logWarning(s"Needed to re-sample 
due to insufficient sample size. Repeat #$numIters") - samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() - numIters += 1 + if (num == 0) { + new Array[T](0) + } else { + val initialCount = this.count() + if (initialCount == 0) { + new Array[T](0) + } else { + val rand = new Random(seed) + if (!withReplacement && num >= initialCount) { + Utils.randomizeInPlace(this.collect(), rand) + } else { + val fraction = SamplingUtils.computeFractionForSampleSize(num, initialCount, + withReplacement) + var samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() + + // If the first sample didn't turn out large enough, keep trying to take samples; + // this shouldn't happen often because we use a big multiplier for the initial size + var numIters = 0 + while (samples.length < num) { + logWarning(s"Needed to re-sample due to insufficient sample size. Repeat #$numIters") + samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() + numIters += 1 + } + Utils.randomizeInPlace(samples, rand).take(num) + } + } } - - Utils.randomizeInPlace(samples, rand).take(num) } /** @@ -1666,7 +1660,7 @@ abstract class RDD[T: ClassTag]( import Utils.bytesToString val persistence = if (storageLevel != StorageLevel.NONE) storageLevel.description else "" - val storageInfo = rdd.context.getRDDStorageInfo.filter(_.id == rdd.id).map(info => + val storageInfo = rdd.context.getRDDStorageInfo(_.id == rdd.id).map(info => " CachedPartitions: %d; MemorySize: %s; ExternalBlockStoreSize: %s; DiskSize: %s".format( info.numCachedPartitions, bytesToString(info.memSize), bytesToString(info.externalBlockStoreSize), bytesToString(info.diskSize))) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala index 0e43520870c0a..429514b4f6bee 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala @@ -36,7 +36,7 @@ private[spark] object CheckpointState extends Enumeration { * as well as, manages the post-checkpoint state by providing the updated partitions, * iterator and preferred locations of the checkpointed RDD. 
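The takeSample rewrite above changes only control flow (require calls and nested branches instead of early returns), not observable behavior; a small sketch of that behavior, assuming an existing SparkContext named sc:

    val data = sc.parallelize(1 to 1000)
    data.takeSample(withReplacement = false, num = 0)            // returns an empty array
    data.takeSample(withReplacement = true, num = 5, seed = 7L)  // exactly 5 elements, repeats allowed
    // A negative num still fails with IllegalArgumentException, now thrown by require()
    // rather than by an explicit throw.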
*/ -private[spark] abstract class RDDCheckpointData[T: ClassTag](@transient rdd: RDD[T]) +private[spark] abstract class RDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T]) extends Serializable { import CheckpointState._ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala index 44667281c1063..540cbd688b63b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala @@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.{JsonIgnore, JsonInclude, JsonPropertyOr import com.fasterxml.jackson.annotation.JsonInclude.Include import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.google.common.base.Objects import org.apache.spark.{Logging, SparkContext} @@ -67,6 +68,8 @@ private[spark] class RDDOperationScope( } } + override def hashCode(): Int = Objects.hashCode(id, name, parent) + override def toString: String = toJson } diff --git a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala index 35d8b0bfd18c5..a69be6a068bbf 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala @@ -32,7 +32,7 @@ import org.apache.spark.util.{SerializableConfiguration, Utils} * An RDD that reads from checkpoint files previously written to reliable storage. */ private[spark] class ReliableCheckpointRDD[T: ClassTag]( - @transient sc: SparkContext, + sc: SparkContext, val checkpointPath: String) extends CheckpointRDD[T](sc) { @@ -144,7 +144,9 @@ private[spark] object ReliableCheckpointRDD extends Logging { } else { // Some other copy of this task must've finished before us and renamed it logInfo(s"Final output path $finalOutputPath already exists; not overwriting it") - fs.delete(tempOutputPath, false) + if (!fs.delete(tempOutputPath, false)) { + logWarning(s"Error deleting ${tempOutputPath}") + } } } } diff --git a/core/src/main/scala/org/apache/spark/rdd/ReliableRDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableRDDCheckpointData.scala index 1df8eef5ff2b9..91cad6662e4d2 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableRDDCheckpointData.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableRDDCheckpointData.scala @@ -28,7 +28,7 @@ import org.apache.spark.util.SerializableConfiguration * An implementation of checkpointing that writes the RDD data to reliable storage. * This allows drivers to be restarted on failure with previously computed state. */ -private[spark] class ReliableRDDCheckpointData[T: ClassTag](@transient rdd: RDD[T]) +private[spark] class ReliableRDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T]) extends RDDCheckpointData[T](rdd) with Logging { // The directory to which the associated RDD has been checkpointed to @@ -89,7 +89,7 @@ private[spark] class ReliableRDDCheckpointData[T: ClassTag](@transient rdd: RDD[ } -private[spark] object ReliableRDDCheckpointData { +private[spark] object ReliableRDDCheckpointData extends Logging { /** Return the path of the directory to which this RDD's checkpoint data is written. 
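Several files in this patch replace a bare fs.delete(...) with a checked call that logs on failure; a hypothetical helper capturing the repeated pattern (not part of the patch):

    import org.apache.hadoop.fs.{FileSystem, Path}

    // Hadoop's FileSystem.delete returns false when nothing was deleted, so the result
    // has to be checked explicitly if failures should be surfaced.
    def deleteOrWarn(fs: FileSystem, path: Path, recursive: Boolean)(warn: String => Unit): Unit = {
      if (!fs.delete(path, recursive)) {
        warn(s"Error deleting $path")
      }
    }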
*/ def checkpointPath(sc: SparkContext, rddId: Int): Option[Path] = { @@ -101,7 +101,9 @@ private[spark] object ReliableRDDCheckpointData { checkpointPath(sc, rddId).foreach { path => val fs = path.getFileSystem(sc.hadoopConfiguration) if (fs.exists(path)) { - fs.delete(path, true) + if (!fs.delete(path, true)) { + logWarning(s"Error deleting ${path.toString()}") + } } } } diff --git a/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala index 2dc47f95937cb..a013c3f66a3a8 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala @@ -17,6 +17,8 @@ package org.apache.spark.rdd +import scala.reflect.ClassTag + import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer @@ -37,7 +39,7 @@ private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { */ // TODO: Make this return RDD[Product2[K, C]] or have some way to configure mutable pairs @DeveloperApi -class ShuffledRDD[K, V, C]( +class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag]( @transient var prev: RDD[_ <: Product2[K, V]], part: Partitioner) extends RDD[(K, C)](prev.context, Nil) { @@ -84,6 +86,12 @@ class ShuffledRDD[K, V, C]( Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } + override def getPreferredLocations(partition: Partition): Seq[String] = { + val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] + val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] + tracker.getPreferredLocationsForShuffle(dep, partition.index) + } + override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) diff --git a/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala index fa3fecc80cb63..0228c54e0511c 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala @@ -39,7 +39,7 @@ import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager, Ut private[spark] class SqlNewHadoopPartition( rddId: Int, val index: Int, - @transient rawSplit: InputSplit with Writable) + rawSplit: InputSplit with Writable) extends SparkPartition { val serializableHadoopSplit = new SerializableWritable(rawSplit) @@ -61,9 +61,9 @@ private[spark] class SqlNewHadoopPartition( * changes based on [[org.apache.spark.rdd.HadoopRDD]]. 
*/ private[spark] class SqlNewHadoopRDD[V: ClassTag]( - @transient sc : SparkContext, + sc : SparkContext, broadcastedConf: Broadcast[SerializableConfiguration], - @transient initDriverSideJobFuncOpt: Option[Job => Unit], + @transient private val initDriverSideJobFuncOpt: Option[Job => Unit], initLocalJobFuncOpt: Option[Job => Unit], inputFormatClass: Class[_ <: InputFormat[Void, V]], valueClass: Class[V]) @@ -86,7 +86,7 @@ private[spark] class SqlNewHadoopRDD[V: ClassTag]( if (isDriverSide) { initDriverSideJobFuncOpt.map(f => f(job)) } - job.getConfiguration + SparkHadoopUtil.get.getConfigurationFromJobContext(job) } private val jobTrackerId: String = { diff --git a/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala index 9a4fa301b06e3..25ec685eff5ab 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala @@ -63,15 +63,17 @@ private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( } override def getDependencies: Seq[Dependency[_]] = { - Seq(rdd1, rdd2).map { rdd => + def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]]) + : Dependency[_] = { if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) - new ShuffleDependency(rdd, part, serializer) + new ShuffleDependency[T1, T2, Any](rdd, part, serializer) } } + Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2)) } override def getPartitions: Array[Partition] = { @@ -105,7 +107,7 @@ private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( seq } } - def integrate(depNum: Int, op: Product2[K, V] => Unit) = { + def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = { dependencies(depNum) match { case oneToOneDependency: OneToOneDependency[_] => val dependencyPartition = partition.narrowDeps(depNum).get.split diff --git a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala index 3986645350a82..66cf4369da2ef 100644 --- a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala @@ -37,9 +37,9 @@ import org.apache.spark.util.Utils */ private[spark] class UnionPartition[T: ClassTag]( idx: Int, - @transient rdd: RDD[T], + @transient private val rdd: RDD[T], val parentRddIndex: Int, - @transient parentRddPartitionIndex: Int) + @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala index 81f40ad33aa5d..70bf04de6400d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala @@ -26,7 +26,7 @@ import org.apache.spark.util.Utils private[spark] class ZippedPartitionsPartition( idx: Int, - @transient rdds: Seq[RDD[_]], + @transient private val rdds: Seq[RDD[_]], @transient val preferredLocations: Seq[String]) extends Partition { @@ -73,6 +73,16 @@ private[spark] abstract class ZippedPartitionsBaseRDD[V: ClassTag]( super.clearDependencies() rdds = null } + + /** + * Call the prepare method of every parent that has one. 
+ * This is needed for reserving execution memory in advance. + */ + protected def tryPrepareParents(): Unit = { + rdds.collect { + case rdd: MapPartitionsWithPreparationRDD[_, _, _] => rdd.prepare() + } + } } private[spark] class ZippedPartitionsRDD2[A: ClassTag, B: ClassTag, V: ClassTag]( @@ -84,6 +94,7 @@ private[spark] class ZippedPartitionsRDD2[A: ClassTag, B: ClassTag, V: ClassTag] extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context)) } @@ -107,6 +118,7 @@ private[spark] class ZippedPartitionsRDD3 extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context), @@ -134,6 +146,7 @@ private[spark] class ZippedPartitionsRDD4 extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2, rdd3, rdd4), preservesPartitioning) { override def compute(s: Partition, context: TaskContext): Iterator[V] = { + tryPrepareParents() val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context), diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index e277ae28d588f..32931d59acb18 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -37,7 +37,7 @@ class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) * @tparam T parent RDD item type */ private[spark] -class ZippedWithIndexRDD[T: ClassTag](@transient prev: RDD[T]) extends RDD[(T, Long)](prev) { +class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) { /** The start index of each partition. */ @transient private val startIndices: Array[Long] = { diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala index 7409ac8859991..f25710bb5bd6e 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala @@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, Logging, SparkConf} /** * A reference for a remote [[RpcEndpoint]]. [[RpcEndpointRef]] is thread-safe. 
*/ -private[spark] abstract class RpcEndpointRef(@transient conf: SparkConf) +private[spark] abstract class RpcEndpointRef(conf: SparkConf) extends Serializable with Logging { private[this] val maxRetries = RpcUtils.numRetries(conf) diff --git a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala index fc17542abf81d..ad67e1c5ad4d5 100644 --- a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala @@ -87,9 +87,9 @@ private[spark] class AkkaRpcEnv private[akka] ( override def setupEndpoint(name: String, endpoint: RpcEndpoint): RpcEndpointRef = { @volatile var endpointRef: AkkaRpcEndpointRef = null - // Use lazy because the Actor needs to use `endpointRef`. + // Use a deferred function because the Actor needs to use `endpointRef`. // So `actorRef` should be created after assigning `endpointRef`. - lazy val actorRef = actorSystem.actorOf(Props(new Actor with ActorLogReceive with Logging { + val actorRef = () => actorSystem.actorOf(Props(new Actor with ActorLogReceive with Logging { assert(endpointRef != null) @@ -272,13 +272,20 @@ private[akka] class ErrorMonitor extends Actor with ActorLogReceive with Logging } private[akka] class AkkaRpcEndpointRef( - @transient defaultAddress: RpcAddress, - @transient _actorRef: => ActorRef, - @transient conf: SparkConf, - @transient initInConstructor: Boolean = true) + @transient private val defaultAddress: RpcAddress, + @transient private val _actorRef: () => ActorRef, + conf: SparkConf, + initInConstructor: Boolean) extends RpcEndpointRef(conf) with Logging { - lazy val actorRef = _actorRef + def this( + defaultAddress: RpcAddress, + _actorRef: ActorRef, + conf: SparkConf) = { + this(defaultAddress, () => _actorRef, conf, true) + } + + lazy val actorRef = _actorRef() override lazy val address: RpcAddress = { val akkaAddress = actorRef.path.address diff --git a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala index 11d123eec43ca..b6bff64ee368e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala @@ -34,9 +34,15 @@ class AccumulableInfo private[spark] ( override def equals(other: Any): Boolean = other match { case acc: AccumulableInfo => this.id == acc.id && this.name == acc.name && - this.update == acc.update && this.value == acc.value + this.update == acc.update && this.value == acc.value && + this.internal == acc.internal case _ => false } + + override def hashCode(): Int = { + val state = Seq(id, name, update, value, internal) + state.map(_.hashCode).reduceLeft(31 * _ + _) + } } object AccumulableInfo { diff --git a/core/src/main/scala/org/apache/spark/scheduler/ActiveJob.scala b/core/src/main/scala/org/apache/spark/scheduler/ActiveJob.scala index 50a69379412d2..a3d2db31301b3 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ActiveJob.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ActiveJob.scala @@ -23,18 +23,42 @@ import org.apache.spark.TaskContext import org.apache.spark.util.CallSite /** - * Tracks information about an active job in the DAGScheduler. + * A running job in the DAGScheduler.
Jobs can be of two types: a result job, which computes a + * ResultStage to execute an action, or a map-stage job, which computes the map outputs for a + * ShuffleMapStage before any downstream stages are submitted. The latter is used for adaptive + * query planning, to look at map output statistics before submitting later stages. We distinguish + * between these two types of jobs using the finalStage field of this class. + * + * Jobs are only tracked for "leaf" stages that clients directly submitted, through DAGScheduler's + * submitJob or submitMapStage methods. However, either type of job may cause the execution of + * other earlier stages (for RDDs in the DAG it depends on), and multiple jobs may share some of + * these previous stages. These dependencies are managed inside DAGScheduler. + * + * @param jobId A unique ID for this job. + * @param finalStage The stage that this job computes (either a ResultStage for an action or a + * ShuffleMapStage for submitMapStage). + * @param callSite Where this job was initiated in the user's program (shown on UI). + * @param listener A listener to notify if tasks in this job finish or the job fails. + * @param properties Scheduling properties attached to the job, such as fair scheduler pool name. */ private[spark] class ActiveJob( val jobId: Int, - val finalStage: ResultStage, - val func: (TaskContext, Iterator[_]) => _, - val partitions: Array[Int], + val finalStage: Stage, val callSite: CallSite, val listener: JobListener, val properties: Properties) { - val numPartitions = partitions.length + /** + * Number of partitions we need to compute for this job. Note that result stages may not need + * to compute all partitions in their target RDD, for actions like first() and lookup(). + */ + val numPartitions = finalStage match { + case r: ResultStage => r.partitions.length + case m: ShuffleMapStage => m.rdd.partitions.length + } + + /** Which partitions of the stage have finished */ val finished = Array.fill[Boolean](numPartitions)(false) + var numFinished = 0 } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index daf9b0f95273e..ade372be092ae 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -45,17 +45,65 @@ import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat * The high-level scheduling layer that implements stage-oriented scheduling. It computes a DAG of * stages for each job, keeps track of which RDDs and stage outputs are materialized, and finds a * minimal schedule to run the job. It then submits stages as TaskSets to an underlying - * TaskScheduler implementation that runs them on the cluster. + * TaskScheduler implementation that runs them on the cluster. A TaskSet contains fully independent + * tasks that can run right away based on the data that's already on the cluster (e.g. map output + * files from previous stages), though it may fail if this data becomes unavailable. * - * In addition to coming up with a DAG of stages, this class also determines the preferred + * Spark stages are created by breaking the RDD graph at shuffle boundaries. 
RDD operations with + * "narrow" dependencies, like map() and filter(), are pipelined together into one set of tasks + * in each stage, but operations with shuffle dependencies require multiple stages (one to write a + * set of map output files, and another to read those files after a barrier). In the end, every + * stage will have only shuffle dependencies on other stages, and may compute multiple operations + * inside it. The actual pipelining of these operations happens in the RDD.compute() functions of + * various RDDs (MappedRDD, FilteredRDD, etc). + * + * In addition to coming up with a DAG of stages, the DAGScheduler also determines the preferred * locations to run each task on, based on the current cache status, and passes these to the * low-level TaskScheduler. Furthermore, it handles failures due to shuffle output files being * lost, in which case old stages may need to be resubmitted. Failures *within* a stage that are * not caused by shuffle file loss are handled by the TaskScheduler, which will retry each task * a small number of times before cancelling the whole stage. * + * When looking through this code, there are several key concepts: + * + * - Jobs (represented by [[ActiveJob]]) are the top-level work items submitted to the scheduler. + * For example, when the user calls an action, like count(), a job will be submitted through + * submitJob. Each Job may require the execution of multiple stages to build intermediate data. + * + * - Stages ([[Stage]]) are sets of tasks that compute intermediate results in jobs, where each + * task computes the same function on partitions of the same RDD. Stages are separated at shuffle + * boundaries, which introduce a barrier (where we must wait for the previous stage to finish to + * fetch outputs). There are two types of stages: [[ResultStage]], for the final stage that + * executes an action, and [[ShuffleMapStage]], which writes map output files for a shuffle. + * Stages are often shared across multiple jobs, if these jobs reuse the same RDDs. + * + * - Tasks are individual units of work, each sent to one machine. + * + * - Cache tracking: the DAGScheduler figures out which RDDs are cached to avoid recomputing them + * and likewise remembers which shuffle map stages have already produced output files to avoid + * redoing the map side of a shuffle. + * + * - Preferred locations: the DAGScheduler also computes where to run each task in a stage based + * on the preferred locations of its underlying RDDs, or the location of cached or shuffle data. + * + * - Cleanup: all data structures are cleared when the running jobs that depend on them finish, + * to prevent memory leaks in a long-running application. + * + * To recover from failures, the same stage might need to run multiple times, which are called + * "attempts". If the TaskScheduler reports that a task failed because a map output file from a + * previous stage was lost, the DAGScheduler resubmits that lost stage. This is detected through a + * CompletionEvent with FetchFailed, or an ExecutorLost event. The DAGScheduler will wait a small + * amount of time to see whether other nodes or tasks fail, then resubmit TaskSets for any lost + * stage(s) that compute the missing tasks. As part of this process, we might also have to create + * Stage objects for old (finished) stages where we previously cleaned up the Stage object. Since + * tasks from the old attempt of a stage could still be running, care must be taken to map any + * events received in the correct Stage object. 
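The narrow-versus-shuffle distinction described above is easiest to see on a small job; a sketch assuming an existing SparkContext named sc and a hypothetical input path:

    val words = sc.textFile("hdfs:///tmp/input")   // hypothetical path
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)                          // narrow dependencies: pipelined into one stage
    val counts = words.map(w => (w, 1)).reduceByKey(_ + _)  // shuffle dependency: stage boundary
    counts.count()  // the action submits one job: a ShuffleMapStage followed by a ResultStage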
+ * * Here's a checklist to use when making or reviewing changes to this class: * + * - All data structures should be cleared when the jobs involving them end to avoid indefinite + * accumulation of state in long-running programs. + * * - When adding a new data structure, update `DAGSchedulerSuite.assertDataStructuresEmpty` to * include the new structure. This will help to catch memory leaks. */ @@ -136,22 +184,6 @@ class DAGScheduler( private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this) taskScheduler.setDAGScheduler(this) - // Flag to control if reduce tasks are assigned preferred locations - private val shuffleLocalityEnabled = - sc.getConf.getBoolean("spark.shuffle.reduceLocality.enabled", true) - // Number of map, reduce tasks above which we do not assign preferred locations - // based on map output sizes. We limit the size of jobs for which assign preferred locations - // as computing the top locations by size becomes expensive. - private[this] val SHUFFLE_PREF_MAP_THRESHOLD = 1000 - // NOTE: This should be less than 2000 as we use HighlyCompressedMapStatus beyond that - private[this] val SHUFFLE_PREF_REDUCE_THRESHOLD = 1000 - - // Fraction of total map output that must be at a location for it to considered as a preferred - // location for a reduce task. - // Making this larger will focus on fewer locations where most data can be read locally, but - // may lead to more delay in scheduling if those locations are busy. - private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2 - /** * Called by the TaskSetManager to report task's starting. */ @@ -250,11 +282,12 @@ class DAGScheduler( case Some(stage) => stage case None => // We are going to register ancestor shuffle dependencies - registerShuffleDependencies(shuffleDep, firstJobId) + getAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep => + shuffleToMapStage(dep.shuffleId) = newOrUsedShuffleStage(dep, firstJobId) + } // Then register current shuffleDep val stage = newOrUsedShuffleStage(shuffleDep, firstJobId) shuffleToMapStage(shuffleDep.shuffleId) = stage - stage } } @@ -294,12 +327,12 @@ class DAGScheduler( */ private def newResultStage( rdd: RDD[_], - numTasks: Int, + func: (TaskContext, Iterator[_]) => _, + partitions: Array[Int], jobId: Int, callSite: CallSite): ResultStage = { val (parentStages: List[Stage], id: Int) = getParentStagesAndId(rdd, jobId) - val stage: ResultStage = new ResultStage(id, rdd, numTasks, parentStages, jobId, callSite) - + val stage = new ResultStage(id, rdd, func, partitions, parentStages, jobId, callSite) stageIdToStage(id) = stage updateJobIdStageIdMaps(jobId, stage) stage @@ -365,16 +398,6 @@ class DAGScheduler( parents.toList } - /** Find ancestor missing shuffle dependencies and register into shuffleToMapStage */ - private def registerShuffleDependencies(shuffleDep: ShuffleDependency[_, _, _], firstJobId: Int) { - val parentsWithNoMapStage = getAncestorShuffleDependencies(shuffleDep.rdd) - while (parentsWithNoMapStage.nonEmpty) { - val currentShufDep = parentsWithNoMapStage.pop() - val stage = newOrUsedShuffleStage(currentShufDep, firstJobId) - shuffleToMapStage(currentShufDep.shuffleId) = stage - } - } - /** Find ancestor shuffle dependencies that are not registered in shuffleToMapStage yet */ private def getAncestorShuffleDependencies(rdd: RDD[_]): Stack[ShuffleDependency[_, _, _]] = { val parents = new Stack[ShuffleDependency[_, _, _]] @@ -391,11 +414,9 @@ class DAGScheduler( if (!shuffleToMapStage.contains(shufDep.shuffleId)) { parents.push(shufDep) } - - 
waitingForVisit.push(shufDep.rdd) case _ => - waitingForVisit.push(dep.rdd) } + waitingForVisit.push(dep.rdd) } } } @@ -511,12 +532,25 @@ class DAGScheduler( jobIdToStageIds -= job.jobId jobIdToActiveJob -= job.jobId activeJobs -= job - job.finalStage.resultOfJob = None + job.finalStage match { + case r: ResultStage => + r.resultOfJob = None + case m: ShuffleMapStage => + m.mapStageJobs = m.mapStageJobs.filter(_ != job) + } } /** - * Submit a job to the job scheduler and get a JobWaiter object back. The JobWaiter object + * Submit an action job to the scheduler and get a JobWaiter object back. The JobWaiter object * can be used to block until the the job finishes executing or can be used to cancel the job. + * + * @param rdd target RDD to run tasks on + * @param func a function to run on each partition of the RDD + * @param partitions set of partitions to run on; some jobs may not want to compute on all + * partitions of the target RDD, e.g. for operations like first() + * @param callSite where in the user program this job was called + * @param resultHandler callback to pass each result to + * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name */ def submitJob[T, U]( rdd: RDD[T], func: (TaskContext, Iterator[T]) => U, @@ -535,6 +569,7 @@ class DAGScheduler( val jobId = nextJobId.getAndIncrement() if (partitions.size == 0) { + // Return immediately if the job is running 0 tasks return new JobWaiter[U](this, jobId, 0, resultHandler) } @@ -547,6 +582,18 @@ class DAGScheduler( waiter } + /** + * Run an action job on the given RDD and pass all the results to the resultHandler function as + * they arrive. Throws an exception if the job fails, or returns normally if successful. + * + * @param rdd target RDD to run tasks on + * @param func a function to run on each partition of the RDD + * @param partitions set of partitions to run on; some jobs may not want to compute on all + * partitions of the target RDD, e.g. for operations like first() + * @param callSite where in the user program this job was called + * @param resultHandler callback to pass each result to + * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name + */ def runJob[T, U]( rdd: RDD[T], func: (TaskContext, Iterator[T]) => U, @@ -570,6 +617,17 @@ class DAGScheduler( } } + /** + * Run an approximate job on the given RDD and pass all the results to an ApproximateEvaluator + * as they arrive. Returns a partial result object from the evaluator. + * + * @param rdd target RDD to run tasks on + * @param func a function to run on each partition of the RDD + * @param evaluator [[ApproximateEvaluator]] to receive the partial results + * @param callSite where in the user program this job was called + * @param timeout maximum time to wait for the job, in milliseconds + * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name + */ def runApproximateJob[T, U, R]( rdd: RDD[T], func: (TaskContext, Iterator[T]) => U, @@ -586,6 +644,41 @@ class DAGScheduler( listener.awaitResult() // Will throw an exception if the job fails } + /** + * Submit a shuffle map stage to run independently and get a JobWaiter object back. The waiter + * can be used to block until the job finishes executing or can be used to cancel the job. + * This method is used for adaptive query planning, to run map stages and look at statistics + * about their outputs before submitting downstream stages.
+ * + * @param dependency the ShuffleDependency to run a map stage for + * @param callback function called with the result of the job, which in this case will be a + * single MapOutputStatistics object showing how much data was produced for each partition + * @param callSite where in the user program this job was submitted + * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name + */ + def submitMapStage[K, V, C]( + dependency: ShuffleDependency[K, V, C], + callback: MapOutputStatistics => Unit, + callSite: CallSite, + properties: Properties): JobWaiter[MapOutputStatistics] = { + + val rdd = dependency.rdd + val jobId = nextJobId.getAndIncrement() + if (rdd.partitions.length == 0) { + throw new SparkException("Can't run submitMapStage on RDD with 0 partitions") + } + + // We create a JobWaiter with only one "task", which will be marked as complete when the whole + // map stage has completed, and will be passed the MapOutputStatistics for that stage. + // This makes it easier to avoid race conditions between the user code and the map output + // tracker that might result if we told the user the stage had finished, but then they query + // the map output tracker and some node failures had caused the output statistics to be lost. + val waiter = new JobWaiter(this, jobId, 1, (i: Int, r: MapOutputStatistics) => callback(r)) + eventProcessLoop.post(MapStageSubmitted( + jobId, dependency, callSite, waiter, SerializationUtils.clone(properties))) + waiter + } + /** * Cancel a job that is running or waiting in the queue. */ @@ -594,6 +687,9 @@ class DAGScheduler( eventProcessLoop.post(JobCancelled(jobId)) } + /** + * Cancel all jobs in the given job group ID. + */ def cancelJobGroup(groupId: String): Unit = { logInfo("Asked to cancel job group " + groupId) eventProcessLoop.post(JobGroupCancelled(groupId)) @@ -731,31 +827,77 @@ class DAGScheduler( try { // New stage creation may throw an exception if, for example, jobs are run on a // HadoopRDD whose underlying HDFS files have been deleted.
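The callback passed to submitMapStage above receives a MapOutputStatistics with per-partition output sizes, which is what makes adaptive decisions possible. A hypothetical sketch of such a decision, written against Spark internals (MapOutputStatistics is private[spark], so this only compiles inside the org.apache.spark package):

    import org.apache.spark.MapOutputStatistics

    // Pick a post-shuffle partition count so that each reduce task reads roughly
    // `targetBytes`; illustrative only, the real adaptive planner is outside this patch.
    def choosePostShufflePartitions(stats: MapOutputStatistics, targetBytes: Long): Int = {
      val totalBytes = stats.bytesByPartitionId.sum
      math.max(1, math.ceil(totalBytes.toDouble / targetBytes).toInt)
    }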
- finalStage = newResultStage(finalRDD, partitions.length, jobId, callSite) + finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite) } catch { case e: Exception => logWarning("Creating new stage failed due to exception - job: " + jobId, e) listener.jobFailed(e) return } - if (finalStage != null) { - val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties) - clearCacheLocs() - logInfo("Got job %s (%s) with %d output partitions".format( - job.jobId, callSite.shortForm, partitions.length)) - logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")") - logInfo("Parents of final stage: " + finalStage.parents) - logInfo("Missing parents: " + getMissingParentStages(finalStage)) - val jobSubmissionTime = clock.getTimeMillis() - jobIdToActiveJob(jobId) = job - activeJobs += job - finalStage.resultOfJob = Some(job) - val stageIds = jobIdToStageIds(jobId).toArray - val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo)) - listenerBus.post( - SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties)) - submitStage(finalStage) + + val job = new ActiveJob(jobId, finalStage, callSite, listener, properties) + clearCacheLocs() + logInfo("Got job %s (%s) with %d output partitions".format( + job.jobId, callSite.shortForm, partitions.length)) + logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")") + logInfo("Parents of final stage: " + finalStage.parents) + logInfo("Missing parents: " + getMissingParentStages(finalStage)) + + val jobSubmissionTime = clock.getTimeMillis() + jobIdToActiveJob(jobId) = job + activeJobs += job + finalStage.resultOfJob = Some(job) + val stageIds = jobIdToStageIds(jobId).toArray + val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo)) + listenerBus.post( + SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties)) + submitStage(finalStage) + + submitWaitingStages() + } + + private[scheduler] def handleMapStageSubmitted(jobId: Int, + dependency: ShuffleDependency[_, _, _], + callSite: CallSite, + listener: JobListener, + properties: Properties) { + // Submitting this map stage might still require the creation of some parent stages, so make + // sure that happens. + var finalStage: ShuffleMapStage = null + try { + // New stage creation may throw an exception if, for example, jobs are run on a + // HadoopRDD whose underlying HDFS files have been deleted. 
+ finalStage = getShuffleMapStage(dependency, jobId) + } catch { + case e: Exception => + logWarning("Creating new stage failed due to exception - job: " + jobId, e) + listener.jobFailed(e) + return } + + val job = new ActiveJob(jobId, finalStage, callSite, listener, properties) + clearCacheLocs() + logInfo("Got map stage job %s (%s) with %d output partitions".format( + jobId, callSite.shortForm, dependency.rdd.partitions.size)) + logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")") + logInfo("Parents of final stage: " + finalStage.parents) + logInfo("Missing parents: " + getMissingParentStages(finalStage)) + + val jobSubmissionTime = clock.getTimeMillis() + jobIdToActiveJob(jobId) = job + activeJobs += job + finalStage.mapStageJobs = job :: finalStage.mapStageJobs + val stageIds = jobIdToStageIds(jobId).toArray + val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo)) + listenerBus.post( + SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties)) + submitStage(finalStage) + + // If the whole stage has already finished, tell the listener and remove it + if (!finalStage.outputLocs.contains(Nil)) { + markMapStageJobAsFinished(job, mapOutputTracker.getStatistics(dependency)) + } + submitWaitingStages() } @@ -786,7 +928,7 @@ class DAGScheduler( private def submitMissingTasks(stage: Stage, jobId: Int) { logDebug("submitMissingTasks(" + stage + ")") // Get our pending tasks and remember them in our pendingTasks entry - stage.pendingTasks.clear() + stage.pendingPartitions.clear() // First figure out the indexes of partition ids to compute. val (allPartitions: Seq[Int], partitionsToCompute: Seq[Int]) = { @@ -825,7 +967,7 @@ class DAGScheduler( case s: ResultStage => val job = s.resultOfJob.get partitionsToCompute.map { id => - val p = job.partitions(id) + val p = s.partitions(id) (id, getPreferredLocs(stage.rdd, p)) }.toMap } @@ -855,7 +997,7 @@ class DAGScheduler( case stage: ShuffleMapStage => closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array() case stage: ResultStage => - closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func): AnyRef).array() + closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array() } taskBinary = sc.broadcast(taskBinaryBytes) @@ -886,7 +1028,7 @@ class DAGScheduler( case stage: ResultStage => val job = stage.resultOfJob.get partitionsToCompute.map { id => - val p: Int = job.partitions(id) + val p: Int = stage.partitions(id) val part = stage.rdd.partitions(p) val locs = taskIdToLocations(id) new ResultTask(stage.id, stage.latestInfo.attemptId, @@ -902,8 +1044,8 @@ class DAGScheduler( if (tasks.size > 0) { logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")") - stage.pendingTasks ++= tasks - logDebug("New pending tasks: " + stage.pendingTasks) + stage.pendingPartitions ++= tasks.map(_.partitionId) + logDebug("New pending partitions: " + stage.pendingPartitions) taskScheduler.submitTasks(new TaskSet( tasks.toArray, stage.id, stage.latestInfo.attemptId, stage.firstJobId, properties)) stage.latestInfo.submissionTime = Some(clock.getTimeMillis()) @@ -970,8 +1112,11 @@ class DAGScheduler( val stageId = task.stageId val taskType = Utils.getFormattedClassName(task) - outputCommitCoordinator.taskCompleted(stageId, task.partitionId, - event.taskInfo.attempt, event.reason) + outputCommitCoordinator.taskCompleted( + stageId, + task.partitionId, + event.taskInfo.attemptNumber, // this is a task attempt number + event.reason) // The success 
case is dealt with separately below, since we need to compute accumulator // updates before posting. @@ -991,7 +1136,7 @@ class DAGScheduler( case Success => listenerBus.post(SparkListenerTaskEnd(stageId, stage.latestInfo.attemptId, taskType, event.reason, event.taskInfo, event.taskMetrics)) - stage.pendingTasks -= task + stage.pendingPartitions -= task.partitionId task match { case rt: ResultTask[_, _] => // Cast to ResultStage here because it's part of the ResultTask @@ -1037,7 +1182,7 @@ class DAGScheduler( shuffleStage.addOutputLoc(smt.partitionId, status) } - if (runningStages.contains(shuffleStage) && shuffleStage.pendingTasks.isEmpty) { + if (runningStages.contains(shuffleStage) && shuffleStage.pendingPartitions.isEmpty) { markStageAsFinished(shuffleStage) logInfo("looking for newly runnable stages") logInfo("running: " + runningStages) @@ -1052,45 +1197,36 @@ class DAGScheduler( // we registered these map outputs. mapOutputTracker.registerMapOutputs( shuffleStage.shuffleDep.shuffleId, - shuffleStage.outputLocs.map(list => if (list.isEmpty) null else list.head), + shuffleStage.outputLocs.map(_.headOption.orNull), changeEpoch = true) clearCacheLocs() + if (shuffleStage.outputLocs.contains(Nil)) { // Some tasks had failed; let's resubmit this shuffleStage // TODO: Lower-level scheduler should also deal with this logInfo("Resubmitting " + shuffleStage + " (" + shuffleStage.name + ") because some of its tasks had failed: " + shuffleStage.outputLocs.zipWithIndex.filter(_._1.isEmpty) - .map(_._2).mkString(", ")) + .map(_._2).mkString(", ")) submitStage(shuffleStage) } else { - val newlyRunnable = new ArrayBuffer[Stage] - for (shuffleStage <- waitingStages) { - logInfo("Missing parents for " + shuffleStage + ": " + - getMissingParentStages(shuffleStage)) - } - for (shuffleStage <- waitingStages if getMissingParentStages(shuffleStage).isEmpty) - { - newlyRunnable += shuffleStage - } - waitingStages --= newlyRunnable - runningStages ++= newlyRunnable - for { - shuffleStage <- newlyRunnable.sortBy(_.id) - jobId <- activeJobForStage(shuffleStage) - } { - logInfo("Submitting " + shuffleStage + " (" + - shuffleStage.rdd + "), which is now runnable") - submitMissingTasks(shuffleStage, jobId) + // Mark any map-stage jobs waiting on this stage as finished + if (shuffleStage.mapStageJobs.nonEmpty) { + val stats = mapOutputTracker.getStatistics(shuffleStage.shuffleDep) + for (job <- shuffleStage.mapStageJobs) { + markMapStageJobAsFinished(job, stats) + } } } + + // Note: newly runnable stages will be submitted below when we submit waiting stages } - } + } case Resubmitted => logInfo("Resubmitted " + task + ", so marking it as still running") - stage.pendingTasks += task + stage.pendingPartitions += task.partitionId case FetchFailed(bmAddress, shuffleId, mapId, reduceId, failureMessage) => val failedStage = stageIdToStage(task.stageId) @@ -1101,7 +1237,6 @@ class DAGScheduler( s" ${task.stageAttemptId} and there is a more recent attempt for that stage " + s"(attempt ID ${failedStage.latestInfo.attemptId}) running") } else { - // It is likely that we receive multiple FetchFailed for a single stage (because we have // multiple tasks running concurrently on different executors). In that case, it is // possible the fetch failure has already been handled by the scheduler. 
@@ -1117,6 +1252,11 @@ class DAGScheduler( if (disallowStageRetryForTest) { abortStage(failedStage, "Fetch failure will not retry stage due to testing config", None) + } else if (failedStage.failedOnFetchAndShouldAbort(task.stageAttemptId)) { + abortStage(failedStage, s"$failedStage (${failedStage.name}) " + + s"has failed the maximum allowable number of " + + s"times: ${Stage.MAX_CONSECUTIVE_FETCH_FAILURES}. " + + s"Most recent failure reason: ${failureMessage}", None) } else if (failedStages.isEmpty) { // Don't schedule an event to resubmit failed stages if failed isn't empty, because // in that case the event will already have been scheduled. @@ -1182,7 +1322,7 @@ class DAGScheduler( // TODO: This will be really slow if we keep accumulating shuffle map stages for ((shuffleId, stage) <- shuffleToMapStage) { stage.removeOutputsOnExecutor(execId) - val locs = stage.outputLocs.map(list => if (list.isEmpty) null else list.head) + val locs = stage.outputLocs.map(_.headOption.orNull) mapOutputTracker.registerMapOutputs(shuffleId, locs, changeEpoch = true) } if (shuffleToMapStage.isEmpty) { @@ -1240,10 +1380,17 @@ class DAGScheduler( if (errorMessage.isEmpty) { logInfo("%s (%s) finished in %s s".format(stage, stage.name, serviceTime)) stage.latestInfo.completionTime = Some(clock.getTimeMillis()) + + // Clear failure count for this stage, now that it's succeeded. + // We only limit consecutive failures of stage attempts,so that if a stage is + // re-used many times in a long-running job, unrelated failures don't eventually cause the + // stage to be aborted. + stage.clearFailures() } else { stage.latestInfo.stageFailed(errorMessage.get) logInfo("%s (%s) failed in %s s".format(stage, stage.name, serviceTime)) } + outputCommitCoordinator.stageEnd(stage.id) listenerBus.post(SparkListenerStageCompleted(stage.latestInfo)) runningStages -= stage @@ -1407,28 +1554,24 @@ class DAGScheduler( return locs } } + case _ => } - // If the RDD has shuffle dependencies and shuffle locality is enabled, pick locations that - // have at least REDUCER_PREF_LOCS_FRACTION of data as preferred locations - if (shuffleLocalityEnabled && rdd.partitions.length < SHUFFLE_PREF_REDUCE_THRESHOLD) { - rdd.dependencies.foreach { - case s: ShuffleDependency[_, _, _] => - if (s.rdd.partitions.length < SHUFFLE_PREF_MAP_THRESHOLD) { - // Get the preferred map output locations for this reducer - val topLocsForReducer = mapOutputTracker.getLocationsWithLargestOutputs(s.shuffleId, - partition, rdd.partitions.length, REDUCER_PREF_LOCS_FRACTION) - if (topLocsForReducer.nonEmpty) { - return topLocsForReducer.get.map(loc => TaskLocation(loc.host, loc.executorId)) - } - } - case _ => - } - } Nil } + /** Mark a map stage job as finished with the given output stats, and report to its listener. 
*/ + def markMapStageJobAsFinished(job: ActiveJob, stats: MapOutputStatistics): Unit = { + // In map stage jobs, we only create a single "task", which is to finish all of the stage + // (including reusing any previous map outputs, etc); so we just mark task 0 as done + job.finished(0) = true + job.numFinished += 1 + job.listener.taskSucceeded(0, stats) + cleanupStateForJobAndIndependentStages(job) + listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobSucceeded)) + } + def stop() { logInfo("Stopping DAGScheduler") messageScheduler.shutdownNow() @@ -1462,6 +1605,9 @@ private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) => dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) + case MapStageSubmitted(jobId, dependency, callSite, listener, properties) => + dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties) + case StageCancelled(stageId) => dagScheduler.handleStageCancellation(stageId) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala index f72a52e85dc15..dda3b6cc7f960 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala @@ -35,6 +35,7 @@ import org.apache.spark.util.CallSite */ private[scheduler] sealed trait DAGSchedulerEvent +/** A result-yielding job was submitted on a target RDD */ private[scheduler] case class JobSubmitted( jobId: Int, finalRDD: RDD[_], @@ -45,6 +46,15 @@ private[scheduler] case class JobSubmitted( properties: Properties = null) extends DAGSchedulerEvent +/** A map stage submitted to run as a separate job */ +private[scheduler] case class MapStageSubmitted( + jobId: Int, + dependency: ShuffleDependency[_, _, _], + callSite: CallSite, + listener: JobListener, + properties: Properties = null) + extends DAGSchedulerEvent + private[scheduler] case class StageCancelled(stageId: Int) extends DAGSchedulerEvent private[scheduler] case class JobCancelled(jobId: Int) extends DAGSchedulerEvent diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 5a06ef02f5c57..000a021a528cf 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -109,7 +109,9 @@ private[spark] class EventLoggingListener( if (shouldOverwrite && fileSystem.exists(path)) { logWarning(s"Event log $path already exists. Overwriting...") - fileSystem.delete(path, true) + if (!fileSystem.delete(path, true)) { + logWarning(s"Error deleting $path") + } } /* The Hadoop LocalFileSystem (r1.0.4) has known issues with syncing (HADOOP-7844). @@ -216,7 +218,9 @@ private[spark] class EventLoggingListener( if (fileSystem.exists(target)) { if (shouldOverwrite) { logWarning(s"Event log $target already exists.
Overwriting...") - fileSystem.delete(target, true) + if (!fileSystem.delete(target, true)) { + logWarning(s"Error deleting $target") + } } else { throw new IOException("Target log file already exists (%s)".format(logPath)) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala b/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala index 2bc43a9186449..0a98c69b89ea5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala @@ -23,16 +23,20 @@ import org.apache.spark.executor.ExecutorExitCode * Represents an explanation for a executor or whole slave failing or exiting. */ private[spark] -class ExecutorLossReason(val message: String) { +class ExecutorLossReason(val message: String) extends Serializable { override def toString: String = message } private[spark] -case class ExecutorExited(val exitCode: Int) - extends ExecutorLossReason(ExecutorExitCode.explainExitCode(exitCode)) { +case class ExecutorExited(exitCode: Int, isNormalExit: Boolean, reason: String) + extends ExecutorLossReason(reason) + +private[spark] object ExecutorExited { + def apply(exitCode: Int, isNormalExit: Boolean): ExecutorExited = { + ExecutorExited(exitCode, isNormalExit, ExecutorExitCode.explainExitCode(exitCode)) + } } private[spark] case class SlaveLost(_message: String = "Slave lost") - extends ExecutorLossReason(_message) { -} + extends ExecutorLossReason(_message) diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala index 5d926377ce86b..add0dedc03f44 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala @@ -25,7 +25,7 @@ import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, RpcEndpoint private sealed trait OutputCommitCoordinationMessage extends Serializable private case object StopCoordinator extends OutputCommitCoordinationMessage -private case class AskPermissionToCommitOutput(stage: Int, task: Long, taskAttempt: Long) +private case class AskPermissionToCommitOutput(stage: Int, partition: Int, attemptNumber: Int) /** * Authority that decides whether tasks can commit output to HDFS. Uses a "first committer wins" @@ -44,8 +44,8 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) var coordinatorRef: Option[RpcEndpointRef] = None private type StageId = Int - private type PartitionId = Long - private type TaskAttemptId = Long + private type PartitionId = Int + private type TaskAttemptNumber = Int /** * Map from active stages's id => partition id => task attempt with exclusive lock on committing @@ -57,7 +57,8 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) * Access to this map should be guarded by synchronizing on the OutputCommitCoordinator instance. */ private val authorizedCommittersByStage: CommittersByStageMap = mutable.Map() - private type CommittersByStageMap = mutable.Map[StageId, mutable.Map[PartitionId, TaskAttemptId]] + private type CommittersByStageMap = + mutable.Map[StageId, mutable.Map[PartitionId, TaskAttemptNumber]] /** * Returns whether the OutputCommitCoordinator's internal data structures are all empty. 
@@ -75,14 +76,15 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) * * @param stage the stage number * @param partition the partition number - * @param attempt a unique identifier for this task attempt + * @param attemptNumber how many times this task has been attempted + * (see [[TaskContext.attemptNumber()]]) * @return true if this task is authorized to commit, false otherwise */ def canCommit( stage: StageId, partition: PartitionId, - attempt: TaskAttemptId): Boolean = { - val msg = AskPermissionToCommitOutput(stage, partition, attempt) + attemptNumber: TaskAttemptNumber): Boolean = { + val msg = AskPermissionToCommitOutput(stage, partition, attemptNumber) coordinatorRef match { case Some(endpointRef) => endpointRef.askWithRetry[Boolean](msg) @@ -95,7 +97,7 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) // Called by DAGScheduler private[scheduler] def stageStart(stage: StageId): Unit = synchronized { - authorizedCommittersByStage(stage) = mutable.HashMap[PartitionId, TaskAttemptId]() + authorizedCommittersByStage(stage) = mutable.HashMap[PartitionId, TaskAttemptNumber]() } // Called by DAGScheduler @@ -107,7 +109,7 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) private[scheduler] def taskCompleted( stage: StageId, partition: PartitionId, - attempt: TaskAttemptId, + attemptNumber: TaskAttemptNumber, reason: TaskEndReason): Unit = synchronized { val authorizedCommitters = authorizedCommittersByStage.getOrElse(stage, { logDebug(s"Ignoring task completion for completed stage") @@ -117,12 +119,12 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) case Success => // The task output has been committed successfully case denied: TaskCommitDenied => - logInfo( - s"Task was denied committing, stage: $stage, partition: $partition, attempt: $attempt") + logInfo(s"Task was denied committing, stage: $stage, partition: $partition, " + + s"attempt: $attemptNumber") case otherReason => - if (authorizedCommitters.get(partition).exists(_ == attempt)) { - logDebug(s"Authorized committer $attempt (stage=$stage, partition=$partition) failed;" + - s" clearing lock") + if (authorizedCommitters.get(partition).exists(_ == attemptNumber)) { + logDebug(s"Authorized committer (attemptNumber=$attemptNumber, stage=$stage, " + + s"partition=$partition) failed; clearing lock") authorizedCommitters.remove(partition) } } @@ -140,21 +142,23 @@ private[spark] class OutputCommitCoordinator(conf: SparkConf, isDriver: Boolean) private[scheduler] def handleAskPermissionToCommit( stage: StageId, partition: PartitionId, - attempt: TaskAttemptId): Boolean = synchronized { + attemptNumber: TaskAttemptNumber): Boolean = synchronized { authorizedCommittersByStage.get(stage) match { case Some(authorizedCommitters) => authorizedCommitters.get(partition) match { case Some(existingCommitter) => - logDebug(s"Denying $attempt to commit for stage=$stage, partition=$partition; " + - s"existingCommitter = $existingCommitter") + logDebug(s"Denying attemptNumber=$attemptNumber to commit for stage=$stage, " + + s"partition=$partition; existingCommitter = $existingCommitter") false case None => - logDebug(s"Authorizing $attempt to commit for stage=$stage, partition=$partition") - authorizedCommitters(partition) = attempt + logDebug(s"Authorizing attemptNumber=$attemptNumber to commit for stage=$stage, " + + s"partition=$partition") + authorizedCommitters(partition) = attemptNumber true } case None => - 
logDebug(s"Stage $stage has completed, so not allowing task attempt $attempt to commit") + logDebug(s"Stage $stage has completed, so not allowing attempt number $attemptNumber of" + + s"partition $partition to commit") false } } @@ -174,9 +178,9 @@ private[spark] object OutputCommitCoordinator { } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case AskPermissionToCommitOutput(stage, partition, taskAttempt) => + case AskPermissionToCommitOutput(stage, partition, attemptNumber) => context.reply( - outputCommitCoordinator.handleAskPermissionToCommit(stage, partition, taskAttempt)) + outputCommitCoordinator.handleAskPermissionToCommit(stage, partition, attemptNumber)) } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index 5821afea98982..551e39a81b695 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -83,8 +83,8 @@ private[spark] class Pool( null } - override def executorLost(executorId: String, host: String) { - schedulableQueue.asScala.foreach(_.executorLost(executorId, host)) + override def executorLost(executorId: String, host: String, reason: ExecutorLossReason) { + schedulableQueue.asScala.foreach(_.executorLost(executorId, host, reason)) } override def checkSpeculatableTasks(): Boolean = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala index bf81b9aca4810..c0451da1f0247 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala @@ -17,23 +17,30 @@ package org.apache.spark.scheduler +import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.util.CallSite /** - * The ResultStage represents the final stage in a job. + * ResultStages apply a function on some partitions of an RDD to compute the result of an action. + * The ResultStage object captures the function to execute, `func`, which will be applied to each + * partition, and the set of partition IDs, `partitions`. Some stages may not run on all partitions + * of the RDD, for actions like first() and lookup(). */ private[spark] class ResultStage( id: Int, rdd: RDD[_], - numTasks: Int, + val func: (TaskContext, Iterator[_]) => _, + val partitions: Array[Int], parents: List[Stage], firstJobId: Int, callSite: CallSite) - extends Stage(id, rdd, numTasks, parents, firstJobId, callSite) { + extends Stage(id, rdd, partitions.length, parents, firstJobId, callSite) { - // The active job for this result stage. Will be empty if the job has already finished - // (e.g., because the job was cancelled). + /** + * The active job for this result stage. Will be empty if the job has already finished + * (e.g., because the job was cancelled). 
+ */ var resultOfJob: Option[ActiveJob] = None override def toString: String = "ResultStage " + id diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala index c4dc080e2b22b..fb693721a9cb6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala @@ -44,7 +44,7 @@ private[spark] class ResultTask[T, U]( stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, - @transient locs: Seq[TaskLocation], + locs: Seq[TaskLocation], val outputId: Int, internalAccumulators: Seq[Accumulator[Long]]) extends Task[U](stageId, stageAttemptId, partition.index, internalAccumulators) diff --git a/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala b/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala index a87ef030e69c2..ab00bc8f0bf4e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala @@ -42,7 +42,7 @@ private[spark] trait Schedulable { def addSchedulable(schedulable: Schedulable): Unit def removeSchedulable(schedulable: Schedulable): Unit def getSchedulableByName(name: String): Schedulable - def executorLost(executorId: String, host: String): Unit + def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit def checkSpeculatableTasks(): Boolean def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] } diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala index 48d8d8e9c4b78..7d92960876403 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala @@ -23,7 +23,15 @@ import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.CallSite /** - * The ShuffleMapStage represents the intermediate stages in a job. + * ShuffleMapStages are intermediate stages in the execution DAG that produce data for a shuffle. + * They occur right before each shuffle operation, and might contain multiple pipelined operations + * before that (e.g. map and filter). When executed, they save map output files that can later be + * fetched by reduce tasks. The `shuffleDep` field describes the shuffle each stage is part of, + * and variables like `outputLocs` and `numAvailableOutputs` track how many map outputs are ready. + * + * ShuffleMapStages can also be submitted independently as jobs with DAGScheduler.submitMapStage. + * For such stages, the ActiveJobs that submitted them are tracked in `mapStageJobs`. Note that + * there can be multiple ActiveJobs trying to compute the same shuffle map stage. 
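To make the interaction with markMapStageJobAsFinished above concrete: once a shuffle map stage has all of its outputs available, every job tracked in mapStageJobs can be completed with the shuffle's output statistics. A rough sketch under simplified, hypothetical types (MapStatsSketch, JobSketch, and MapStageSketch are illustrative names, not Spark classes):

```scala
// Hypothetical, simplified model of a map-stage job: a single logical "task"
// (index 0) that succeeds once the whole shuffle's map output is available.
case class MapStatsSketch(shuffleId: Int, bytesByPartition: Array[Long])

class JobSketch(val jobId: Int) {
  var finished = false
  def taskSucceeded(stats: MapStatsSketch): Unit = { finished = true }
}

class MapStageSketch(numPartitions: Int) {
  var mapStageJobs: List[JobSketch] = Nil
  private var numAvailableOutputs = 0
  def isAvailable: Boolean = numAvailableOutputs == numPartitions

  // Called as each map task registers its output; when the stage becomes
  // available, every waiting map-stage job is marked finished.
  def outputRegistered(stats: MapStatsSketch): Unit = {
    numAvailableOutputs += 1
    if (isAvailable) mapStageJobs.foreach(_.taskSucceeded(stats))
  }
}
```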
*/ private[spark] class ShuffleMapStage( id: Int, @@ -37,6 +45,9 @@ private[spark] class ShuffleMapStage( override def toString: String = "ShuffleMapStage " + id + /** Running map-stage jobs that were submitted to execute this stage independently (if any) */ + var mapStageJobs: List[ActiveJob] = Nil + var numAvailableOutputs: Int = 0 def isAvailable: Boolean = numAvailableOutputs == numPartitions diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index 1cf06856ffbc2..a3829c319c48d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -24,29 +24,35 @@ import org.apache.spark.rdd.RDD import org.apache.spark.util.CallSite /** - * A stage is a set of independent tasks all computing the same function that need to run as part + * A stage is a set of parallel tasks all computing the same function that need to run as part * of a Spark job, where all the tasks have the same shuffle dependencies. Each DAG of tasks run * by the scheduler is split up into stages at the boundaries where shuffle occurs, and then the * DAGScheduler runs these stages in topological order. * * Each Stage can either be a shuffle map stage, in which case its tasks' results are input for - * another stage, or a result stage, in which case its tasks directly compute the action that - * initiated a job (e.g. count(), save(), etc). For shuffle map stages, we also track the nodes - * that each output partition is on. + * other stage(s), or a result stage, in which case its tasks directly compute a Spark action + * (e.g. count(), save(), etc) by running a function on an RDD. For shuffle map stages, we also + * track the nodes that each output partition is on. * * Each Stage also has a firstJobId, identifying the job that first submitted the stage. When FIFO * scheduling is used, this allows Stages from earlier jobs to be computed first or recovered * faster on failure. * - * The callSite provides a location in user code which relates to the stage. For a shuffle map - * stage, the callSite gives the user code that created the RDD being shuffled. For a result - * stage, the callSite gives the user code that executes the associated action (e.g. count()). - * - * A single stage can consist of multiple attempts. In that case, the latestInfo field will - * be updated for each attempt. + * Finally, a single stage can be re-executed in multiple attempts due to fault recovery. In that + * case, the Stage object will track multiple StageInfo objects to pass to listeners or the web UI. + * The latest one will be accessible through latestInfo. * + * @param id Unique stage ID + * @param rdd RDD that this stage runs on: for a shuffle map stage, it's the RDD we run map tasks + * on, while for a result stage, it's the target RDD that we ran an action on + * @param numTasks Total number of tasks in stage; result stages in particular may not need to + * compute all partitions, e.g. for first(), lookup(), and take(). + * @param parents List of stages that this stage depends on (through shuffle dependencies). + * @param firstJobId ID of the first job this stage was part of, for FIFO scheduling. + * @param callSite Location in the user program associated with this stage: either where the target + * RDD was created, for a shuffle map stage, or where the action for a result stage was called. 
*/ -private[spark] abstract class Stage( +private[scheduler] abstract class Stage( val id: Int, val rdd: RDD[_], val numTasks: Int, @@ -60,7 +66,7 @@ private[spark] abstract class Stage( /** Set of jobs that this stage belongs to. */ val jobIds = new HashSet[Int] - var pendingTasks = new HashSet[Task[_]] + val pendingPartitions = new HashSet[Int] /** The ID to use for the next new attempt for this stage. */ private var nextAttemptId: Int = 0 @@ -92,6 +98,29 @@ private[spark] abstract class Stage( */ private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId) + /** + * Set of stage attempt IDs that have failed with a FetchFailure. We keep track of these + * failures in order to avoid endless retries if a stage keeps failing with a FetchFailure. + * We keep track of each attempt ID that has failed to avoid recording duplicate failures if + * multiple tasks from the same stage attempt fail (SPARK-5945). + */ + private val fetchFailedAttemptIds = new HashSet[Int] + + private[scheduler] def clearFailures() : Unit = { + fetchFailedAttemptIds.clear() + } + + /** + * Check whether we should abort the failedStage due to multiple consecutive fetch failures. + * + * This method updates the running set of failed stage attempts and returns + * true if the number of failures exceeds the allowable number of failures. + */ + private[scheduler] def failedOnFetchAndShouldAbort(stageAttemptId: Int): Boolean = { + fetchFailedAttemptIds.add(stageAttemptId) + fetchFailedAttemptIds.size >= Stage.MAX_CONSECUTIVE_FETCH_FAILURES + } + /** Creates a new attempt for this stage by creating a new StageInfo with a new attempt ID. */ def makeNewStageAttempt( numPartitionsToCompute: Int, @@ -110,3 +139,8 @@ private[spark] abstract class Stage( case _ => false } } + +private[scheduler] object Stage { + // The number of consecutive failures allowed before a stage is aborted + val MAX_CONSECUTIVE_FETCH_FAILURES = 4 +} diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala index 132a9ced77700..f113c2b1b8433 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala @@ -29,7 +29,7 @@ import org.apache.spark.annotation.DeveloperApi class TaskInfo( val taskId: Long, val index: Int, - val attempt: Int, + val attemptNumber: Int, val launchTime: Long, val executorId: String, val host: String, @@ -95,7 +95,10 @@ class TaskInfo( } } - def id: String = s"$index.$attempt" + @deprecated("Use attemptNumber", "1.6.0") + def attempt: Int = attemptNumber + + def id: String = s"$index.$attemptNumber" def duration: Long = { if (!finished) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 1705e7f962de2..1c7bfe89c02ac 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -332,7 +332,8 @@ private[spark] class TaskSchedulerImpl( // We lost this entire executor, so remember that it's gone val execId = taskIdToExecutorId(tid) if (activeExecutorIds.contains(execId)) { - removeExecutor(execId) + removeExecutor(execId, + SlaveLost(s"Task $tid was lost, so marking the executor as lost as well.")) failedExecutor = Some(execId) } } @@ -464,7 +465,7 @@ private[spark] class TaskSchedulerImpl( if (activeExecutorIds.contains(executorId)) { val hostPort = 
executorIdToHost(executorId) logError("Lost executor %s on %s: %s".format(executorId, hostPort, reason)) - removeExecutor(executorId) + removeExecutor(executorId, reason) failedExecutor = Some(executorId) } else { // We may get multiple executorLost() calls with different loss reasons. For example, one @@ -482,7 +483,7 @@ private[spark] class TaskSchedulerImpl( } /** Remove an executor from all our data structures and mark it as lost */ - private def removeExecutor(executorId: String) { + private def removeExecutor(executorId: String, reason: ExecutorLossReason) { activeExecutorIds -= executorId val host = executorIdToHost(executorId) val execs = executorsByHost.getOrElse(host, new HashSet) @@ -497,7 +498,7 @@ private[spark] class TaskSchedulerImpl( } } executorIdToHost -= executorId - rootPool.executorLost(executorId, host) + rootPool.executorLost(executorId, host, reason) } def executorAdded(execId: String, host: String) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 818b95d67f6be..c02597c4365c9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -487,8 +487,8 @@ private[spark] class TaskSetManager( // a good proxy to task serialization time. // val timeTaken = clock.getTime() - startTime val taskName = s"task ${info.id} in stage ${taskSet.id}" - logInfo("Starting %s (TID %d, %s, %s, %d bytes)".format( - taskName, taskId, host, taskLocality, serializedTask.limit)) + logInfo(s"Starting $taskName (TID $taskId, $host, partition ${task.partitionId}," + + s"$taskLocality, ${serializedTask.limit} bytes)") sched.dagScheduler.taskStarted(task, info) return Some(new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId, @@ -709,6 +709,11 @@ private[spark] class TaskSetManager( } ef.exception + case e: ExecutorLostFailure if e.isNormalExit => + logInfo(s"Task $tid failed because while it was being computed, its executor" + + s" exited normally. Not marking the task as failed.") + None + case e: TaskFailedReason => // TaskResultLost, TaskKilled, and others logWarning(failureReason) None @@ -722,10 +727,9 @@ private[spark] class TaskSetManager( put(info.executorId, clock.getTimeMillis()) sched.dagScheduler.taskEnded(tasks(index), reason, null, null, info, taskMetrics) addPendingTask(index) - if (!isZombie && state != TaskState.KILLED && !reason.isInstanceOf[TaskCommitDenied]) { - // If a task failed because its attempt to commit was denied, do not count this failure - // towards failing the stage. This is intended to prevent spurious stage failures in cases - // where many speculative tasks are launched and denied to commit. + if (!isZombie && state != TaskState.KILLED + && reason.isInstanceOf[TaskFailedReason] + && reason.asInstanceOf[TaskFailedReason].shouldEventuallyFailJob) { assert (null != failureReason) numFailures(index) += 1 if (numFailures(index) >= maxTaskFailures) { @@ -778,7 +782,7 @@ private[spark] class TaskSetManager( } /** Called by TaskScheduler when an executor is lost so we can re-enqueue our tasks */ - override def executorLost(execId: String, host: String) { + override def executorLost(execId: String, host: String, reason: ExecutorLossReason) { logInfo("Re-queueing tasks for " + execId + " from TaskSet " + taskSet.id) // Re-enqueue pending tasks for this host based on the status of the cluster. 
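The failure-accounting change earlier in this file is subtle enough to spell out: a failed task now only counts toward the task-failure limit when its failure reason says it should eventually fail the job, so commit-denied attempts and tasks lost to a normally-exiting (for example, preempted) executor no longer push a stage toward abort. A hedged sketch of that decision using stand-in reason types (not Spark's real TaskEndReason hierarchy):

```scala
// Stand-in reason types; the real ones live in org.apache.spark.TaskEndReason.
sealed trait ReasonSketch { def shouldEventuallyFailJob: Boolean }
case object CommitDeniedSketch extends ReasonSketch {
  val shouldEventuallyFailJob = false // denied speculative commits are not real failures
}
case class ExecutorLostSketch(isNormalExit: Boolean) extends ReasonSketch {
  val shouldEventuallyFailJob = !isNormalExit // a preempted executor is not the task's fault
}
case class ExceptionSketch(message: String) extends ReasonSketch {
  val shouldEventuallyFailJob = true
}

object FailureAccountingSketch {
  /** Mirrors the spirit of the TaskSetManager check: only some failures count. */
  def countsTowardMaxFailures(reason: ReasonSketch, isZombie: Boolean, killed: Boolean): Boolean =
    !isZombie && !killed && reason.shouldEventuallyFailJob
}
```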
Note @@ -809,9 +813,12 @@ private[spark] class TaskSetManager( } } } - // Also re-enqueue any tasks that were running on the node for ((tid, info) <- taskInfos if info.running && info.executorId == execId) { - handleFailedTask(tid, TaskState.FAILED, ExecutorLostFailure(execId)) + val isNormalExit: Boolean = reason match { + case exited: ExecutorExited => exited.isNormalExit + case _ => false + } + handleFailedTask(tid, TaskState.FAILED, ExecutorLostFailure(info.executorId, isNormalExit)) } // recalculate valid locality levels and waits when executor is lost recomputeLocality() diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala index 06f5438433b6e..d94743677783f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala @@ -21,6 +21,7 @@ import java.nio.ByteBuffer import org.apache.spark.TaskState.TaskState import org.apache.spark.rpc.RpcEndpointRef +import org.apache.spark.scheduler.ExecutorLossReason import org.apache.spark.util.{SerializableBuffer, Utils} private[spark] sealed trait CoarseGrainedClusterMessage extends Serializable @@ -70,7 +71,8 @@ private[spark] object CoarseGrainedClusterMessages { case object StopExecutors extends CoarseGrainedClusterMessage - case class RemoveExecutor(executorId: String, reason: String) extends CoarseGrainedClusterMessage + case class RemoveExecutor(executorId: String, reason: ExecutorLossReason) + extends CoarseGrainedClusterMessage case class SetupDriver(driver: RpcEndpointRef) extends CoarseGrainedClusterMessage @@ -92,6 +94,10 @@ private[spark] object CoarseGrainedClusterMessages { hostToLocalTaskCount: Map[String, Int]) extends CoarseGrainedClusterMessage + // Check if an executor was force-killed but for a normal reason. + // This could be the case if the executor is preempted, for instance. 
+ case class GetExecutorLossReason(executorId: String) extends CoarseGrainedClusterMessage + case class KillExecutors(executorIds: Seq[String]) extends CoarseGrainedClusterMessage } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 5730a87f960a0..18771f79b44bb 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -26,6 +26,7 @@ import org.apache.spark.rpc._ import org.apache.spark.{ExecutorAllocationClient, Logging, SparkEnv, SparkException, TaskState} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ +import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.ENDPOINT_NAME import org.apache.spark.util.{ThreadUtils, SerializableBuffer, AkkaUtils, Utils} /** @@ -82,7 +83,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp override protected def log = CoarseGrainedSchedulerBackend.this.log - private val addressToExecutorId = new HashMap[RpcAddress, String] + protected val addressToExecutorId = new HashMap[RpcAddress, String] private val reviveThread = ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-revive-thread") @@ -128,6 +129,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { + case RegisterExecutor(executorId, executorRef, hostPort, cores, logUrls) => Utils.checkHostPort(hostPort, "Host port expected " + hostPort) if (executorDataMap.contains(executorId)) { @@ -185,8 +187,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } override def onDisconnected(remoteAddress: RpcAddress): Unit = { - addressToExecutorId.get(remoteAddress).foreach(removeExecutor(_, - "remote Rpc client disassociated")) + addressToExecutorId + .get(remoteAddress) + .foreach(removeExecutor(_, SlaveLost("remote Rpc client disassociated"))) } // Make fake resource offers on just one executor @@ -227,7 +230,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } // Remove a disconnected slave from the cluster - def removeExecutor(executorId: String, reason: String): Unit = { + def removeExecutor(executorId: String, reason: ExecutorLossReason): Unit = { executorDataMap.get(executorId) match { case Some(executorInfo) => // This must be synchronized because variables mutated @@ -239,9 +242,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } totalCoreCount.addAndGet(-executorInfo.totalCores) totalRegisteredExecutors.addAndGet(-1) - scheduler.executorLost(executorId, SlaveLost(reason)) + scheduler.executorLost(executorId, reason) listenerBus.post( - SparkListenerExecutorRemoved(System.currentTimeMillis(), executorId, reason)) + SparkListenerExecutorRemoved(System.currentTimeMillis(), executorId, reason.toString)) case None => logInfo(s"Asked to remove non-existent executor $executorId") } } @@ -263,8 +266,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } // TODO (prashant) send conf instead of properties - driverEndpoint = rpcEnv.setupEndpoint( - CoarseGrainedSchedulerBackend.ENDPOINT_NAME, new DriverEndpoint(rpcEnv, properties)) + driverEndpoint = 
rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties)) + } + + protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = { + new DriverEndpoint(rpcEnv, properties) } def stopExecutors() { @@ -304,7 +310,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } // Called by subclasses when notified of a lost worker - def removeExecutor(executorId: String, reason: String) { + def removeExecutor(executorId: String, reason: ExecutorLossReason) { try { driverEndpoint.askWithRetry[Boolean](RemoveExecutor(executorId, reason)) } catch { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala index 0324c9dab910b..641638a77d5f5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala @@ -65,7 +65,9 @@ private[spark] class SimrSchedulerBackend( override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) - fs.delete(new Path(driverFilePath), false) + if (!fs.delete(new Path(driverFilePath), false)) { + logWarning(s"error deleting ${driverFilePath}") + } super.stop() } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index bbe51b4a09a22..27491ecf8b97d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -23,7 +23,7 @@ import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv} import org.apache.spark.deploy.{ApplicationDescription, Command} import org.apache.spark.deploy.client.{AppClient, AppClientListener} -import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason, SlaveLost, TaskSchedulerImpl} +import org.apache.spark.scheduler._ import org.apache.spark.util.Utils private[spark] class SparkDeploySchedulerBackend( @@ -135,11 +135,11 @@ private[spark] class SparkDeploySchedulerBackend( override def executorRemoved(fullId: String, message: String, exitStatus: Option[Int]) { val reason: ExecutorLossReason = exitStatus match { - case Some(code) => ExecutorExited(code) + case Some(code) => ExecutorExited(code, isNormalExit = true, message) case None => SlaveLost(message) } logInfo("Executor %s removed: %s".format(fullId, message)) - removeExecutor(fullId.split("/")(1), reason.toString) + removeExecutor(fullId.split("/")(1), reason) } override def sufficientResourcesRegistered(): Boolean = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala index 044f6288fabdd..6a4b536dee191 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala @@ -17,12 +17,13 @@ package org.apache.spark.scheduler.cluster +import scala.collection.mutable.ArrayBuffer import scala.concurrent.{Future, ExecutionContext} import org.apache.spark.{Logging, SparkContext} import org.apache.spark.rpc._ import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ -import 
org.apache.spark.scheduler.TaskSchedulerImpl +import org.apache.spark.scheduler._ import org.apache.spark.ui.JettyUtils import org.apache.spark.util.{ThreadUtils, RpcUtils} @@ -43,8 +44,10 @@ private[spark] abstract class YarnSchedulerBackend( protected var totalExpectedExecutors = 0 - private val yarnSchedulerEndpoint = rpcEnv.setupEndpoint( - YarnSchedulerBackend.ENDPOINT_NAME, new YarnSchedulerEndpoint(rpcEnv)) + private val yarnSchedulerEndpoint = new YarnSchedulerEndpoint(rpcEnv) + + private val yarnSchedulerEndpointRef = rpcEnv.setupEndpoint( + YarnSchedulerBackend.ENDPOINT_NAME, yarnSchedulerEndpoint) private implicit val askTimeout = RpcUtils.askRpcTimeout(sc.conf) @@ -53,7 +56,7 @@ private[spark] abstract class YarnSchedulerBackend( * This includes executors already pending or running. */ override def doRequestTotalExecutors(requestedTotal: Int): Boolean = { - yarnSchedulerEndpoint.askWithRetry[Boolean]( + yarnSchedulerEndpointRef.askWithRetry[Boolean]( RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount)) } @@ -61,7 +64,7 @@ private[spark] abstract class YarnSchedulerBackend( * Request that the ApplicationMaster kill the specified executors. */ override def doKillExecutors(executorIds: Seq[String]): Boolean = { - yarnSchedulerEndpoint.askWithRetry[Boolean](KillExecutors(executorIds)) + yarnSchedulerEndpointRef.askWithRetry[Boolean](KillExecutors(executorIds)) } override def sufficientResourcesRegistered(): Boolean = { @@ -90,6 +93,41 @@ private[spark] abstract class YarnSchedulerBackend( } } + override def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = { + new YarnDriverEndpoint(rpcEnv, properties) + } + + /** + * Override the DriverEndpoint to add extra logic for the case when an executor is disconnected. + * This endpoint communicates with the executors and queries the AM for an executor's exit + * status when the executor is disconnected. + */ + private class YarnDriverEndpoint(rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)]) + extends DriverEndpoint(rpcEnv, sparkProperties) { + + /** + * When onDisconnected is received at the driver endpoint, the superclass DriverEndpoint + * handles it by assuming the Executor was lost for a bad reason and removes the executor + * immediately. + * + * In YARN's case however it is crucial to talk to the application master and ask why the + * executor had exited. In particular, the executor may have exited due to the executor + * having been preempted. If the executor "exited normally" according to the application + * master then we pass that information down to the TaskSetManager to inform the + * TaskSetManager that tasks on that lost executor should not count towards a job failure. + * + * TODO there's a race condition where while we are querying the ApplicationMaster for + * the executor loss reason, there is the potential that tasks will be scheduled on + * the executor that failed. We should fix this by having this onDisconnected event + * also "blacklist" executors so that tasks are not assigned to them. + */ + override def onDisconnected(rpcAddress: RpcAddress): Unit = { + addressToExecutorId.get(rpcAddress).foreach { executorId => + yarnSchedulerEndpoint.handleExecutorDisconnectedFromDriver(executorId, rpcAddress) + } + } + } + /** * An [[RpcEndpoint]] that communicates with the ApplicationMaster. 
*/ @@ -101,6 +139,33 @@ private[spark] abstract class YarnSchedulerBackend( ThreadUtils.newDaemonCachedThreadPool("yarn-scheduler-ask-am-thread-pool") implicit val askAmExecutor = ExecutionContext.fromExecutor(askAmThreadPool) + private[YarnSchedulerBackend] def handleExecutorDisconnectedFromDriver( + executorId: String, + executorRpcAddress: RpcAddress): Unit = { + amEndpoint match { + case Some(am) => + val lossReasonRequest = GetExecutorLossReason(executorId) + val future = am.ask[ExecutorLossReason](lossReasonRequest, askTimeout) + future onSuccess { + case reason: ExecutorLossReason => { + driverEndpoint.askWithRetry[Boolean](RemoveExecutor(executorId, reason)) + } + } + future onFailure { + case NonFatal(e) => { + logWarning(s"Attempted to get executor loss reason" + + s" for executor id ${executorId} at RPC address ${executorRpcAddress}," + + s" but got no response. Marking as slave lost.", e) + driverEndpoint.askWithRetry[Boolean](RemoveExecutor(executorId, SlaveLost())) + } + case t => throw t + } + case None => + logWarning("Attempted to check for an executor loss reason" + + " before the AM has registered!") + } + } + override def receive: PartialFunction[Any, Unit] = { case RegisterClusterManager(am) => logInfo(s"ApplicationMaster registered as $am") @@ -113,6 +178,7 @@ private[spark] abstract class YarnSchedulerBackend( removeExecutor(executorId, reason) } + override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case r: RequestExecutors => amEndpoint match { @@ -143,7 +209,6 @@ private[spark] abstract class YarnSchedulerBackend( logWarning("Attempted to kill executors before the AM has registered!") context.reply(false) } - } override def onDisconnected(remoteAddress: RpcAddress): Unit = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 452c32d5411cd..65cb5016cfcc9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -32,7 +32,7 @@ import org.apache.spark.{SecurityManager, SparkContext, SparkEnv, SparkException import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient import org.apache.spark.rpc.RpcAddress -import org.apache.spark.scheduler.TaskSchedulerImpl +import org.apache.spark.scheduler.{SlaveLost, TaskSchedulerImpl} import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend import org.apache.spark.util.Utils @@ -222,6 +222,10 @@ private[spark] class CoarseMesosSchedulerBackend( markRegistered() } + override def sufficientResourcesRegistered(): Boolean = { + totalCoresAcquired >= maxCores * minRegisteredRatio + } + override def disconnected(d: SchedulerDriver) {} override def reregistered(d: SchedulerDriver, masterInfo: MasterInfo) {} @@ -364,7 +368,7 @@ private[spark] class CoarseMesosSchedulerBackend( if (slaveIdToTaskId.containsKey(slaveId)) { val taskId: Int = slaveIdToTaskId.get(slaveId) taskIdToSlaveId.remove(taskId) - removeExecutor(sparkExecutorId(slaveId, taskId.toString), reason) + removeExecutor(sparkExecutorId(slaveId, taskId.toString), SlaveLost(reason)) } // TODO: This assumes one Spark executor per Mesos slave, // which may no longer be true after SPARK-5095 diff --git 
a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index 07da9242b9922..a6d9374eb9e8c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -29,7 +29,6 @@ import org.apache.mesos.Protos.Environment.Variable import org.apache.mesos.Protos.TaskStatus.Reason import org.apache.mesos.Protos.{TaskState => MesosTaskState, _} import org.apache.mesos.{Scheduler, SchedulerDriver} - import org.apache.spark.deploy.mesos.MesosDriverDescription import org.apache.spark.deploy.rest.{CreateSubmissionResponse, KillSubmissionResponse, SubmissionStatusResponse} import org.apache.spark.metrics.MetricsSystem @@ -375,21 +374,20 @@ private[spark] class MesosClusterScheduler( val executorOpts = desc.schedulerProperties.map { case (k, v) => s"-D$k=$v" }.mkString(" ") envBuilder.addVariables( Variable.newBuilder().setName("SPARK_EXECUTOR_OPTS").setValue(executorOpts)) - val cmdOptions = generateCmdOption(desc).mkString(" ") val dockerDefined = desc.schedulerProperties.contains("spark.mesos.executor.docker.image") val executorUri = desc.schedulerProperties.get("spark.executor.uri") .orElse(desc.command.environment.get("SPARK_EXECUTOR_URI")) - val appArguments = desc.command.arguments.mkString(" ") - val (executable, jar) = if (dockerDefined) { + // Gets the path to run spark-submit, and the path to the Mesos sandbox. + val (executable, sandboxPath) = if (dockerDefined) { // Application jar is automatically downloaded in the mounted sandbox by Mesos, // and the path to the mounted volume is stored in $MESOS_SANDBOX env variable. - ("./bin/spark-submit", s"$$MESOS_SANDBOX/${desc.jarUrl.split("/").last}") + ("./bin/spark-submit", "$MESOS_SANDBOX") } else if (executorUri.isDefined) { builder.addUris(CommandInfo.URI.newBuilder().setValue(executorUri.get).build()) val folderBasename = executorUri.get.split('/').last.split('.').head val cmdExecutable = s"cd $folderBasename*; $prefixEnv bin/spark-submit" - val cmdJar = s"../${desc.jarUrl.split("/").last}" - (cmdExecutable, cmdJar) + // Sandbox path points to the parent folder as we chdir into the folderBasename. + (cmdExecutable, "..") } else { val executorSparkHome = desc.schedulerProperties.get("spark.mesos.executor.home") .orElse(conf.getOption("spark.home")) @@ -398,30 +396,50 @@ private[spark] class MesosClusterScheduler( throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!") } val cmdExecutable = new File(executorSparkHome, "./bin/spark-submit").getCanonicalPath - val cmdJar = desc.jarUrl.split("/").last - (cmdExecutable, cmdJar) + // Sandbox points to the current directory by default with Mesos. 
+ (cmdExecutable, ".") } - builder.setValue(s"$executable $cmdOptions $jar $appArguments") + val primaryResource = new File(sandboxPath, desc.jarUrl.split("/").last).toString() + val cmdOptions = generateCmdOption(desc, sandboxPath).mkString(" ") + val appArguments = desc.command.arguments.mkString(" ") + builder.setValue(s"$executable $cmdOptions $primaryResource $appArguments") builder.setEnvironment(envBuilder.build()) conf.getOption("spark.mesos.uris").map { uris => setupUris(uris, builder) } + desc.schedulerProperties.get("spark.mesos.uris").map { uris => + setupUris(uris, builder) + } + desc.schedulerProperties.get("spark.submit.pyFiles").map { pyFiles => + setupUris(pyFiles, builder) + } builder.build() } - private def generateCmdOption(desc: MesosDriverDescription): Seq[String] = { + private def generateCmdOption(desc: MesosDriverDescription, sandboxPath: String): Seq[String] = { var options = Seq( "--name", desc.schedulerProperties("spark.app.name"), - "--class", desc.command.mainClass, "--master", s"mesos://${conf.get("spark.master")}", "--driver-cores", desc.cores.toString, "--driver-memory", s"${desc.mem}M") + + // Assume empty main class means we're running python + if (!desc.command.mainClass.equals("")) { + options ++= Seq("--class", desc.command.mainClass) + } + desc.schedulerProperties.get("spark.executor.memory").map { v => options ++= Seq("--executor-memory", v) } desc.schedulerProperties.get("spark.cores.max").map { v => options ++= Seq("--total-executor-cores", v) } + desc.schedulerProperties.get("spark.submit.pyFiles").map { pyFiles => + val formattedFiles = pyFiles.split(",") + .map { path => new File(sandboxPath, path.split("/").last).toString() } + .mkString(",") + options ++= Seq("--py-files", formattedFiles) + } options } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 2e424054be785..8edf7007a5daf 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -32,7 +32,6 @@ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.util.Utils - /** * A SchedulerBackend for running fine-grained tasks on Mesos. 
Each Spark task is mapped to a * separate Mesos task, allowing multiple applications to share cluster nodes both in space (tasks @@ -127,7 +126,7 @@ private[spark] class MesosSchedulerBackend( } val builder = MesosExecutorInfo.newBuilder() val (resourcesAfterCpu, usedCpuResources) = - partitionResources(availableResources, "cpus", scheduler.CPUS_PER_TASK) + partitionResources(availableResources, "cpus", mesosExecutorCores) val (resourcesAfterMem, usedMemResources) = partitionResources(resourcesAfterCpu.asJava, "mem", calculateTotalMemory(sc)) @@ -390,7 +389,7 @@ private[spark] class MesosSchedulerBackend( slaveId: SlaveID, status: Int) { logInfo("Executor lost: %s, marking slave %s as lost".format(executorId.getValue, slaveId.getValue)) - recordSlaveLost(d, slaveId, ExecutorExited(status)) + recordSlaveLost(d, slaveId, ExecutorExited(status, isNormalExit = false)) } override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala index 4a5274b46b7a0..b463a71d5bd7d 100644 --- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala @@ -62,17 +62,34 @@ private[spark] class JavaDeserializationStream(in: InputStream, loader: ClassLoa extends DeserializationStream { private val objIn = new ObjectInputStream(in) { - override def resolveClass(desc: ObjectStreamClass): Class[_] = { - // scalastyle:off classforname - Class.forName(desc.getName, false, loader) - // scalastyle:on classforname - } + override def resolveClass(desc: ObjectStreamClass): Class[_] = + try { + // scalastyle:off classforname + Class.forName(desc.getName, false, loader) + // scalastyle:on classforname + } catch { + case e: ClassNotFoundException => + JavaDeserializationStream.primitiveMappings.get(desc.getName).getOrElse(throw e) + } } def readObject[T: ClassTag](): T = objIn.readObject().asInstanceOf[T] def close() { objIn.close() } } +private object JavaDeserializationStream { + val primitiveMappings = Map[String, Class[_]]( + "boolean" -> classOf[Boolean], + "byte" -> classOf[Byte], + "char" -> classOf[Char], + "short" -> classOf[Short], + "int" -> classOf[Int], + "long" -> classOf[Long], + "float" -> classOf[Float], + "double" -> classOf[Double], + "void" -> classOf[Void] + ) +} private[spark] class JavaSerializerInstance( counterReset: Int, extraDebugInfo: Boolean, defaultClassLoader: ClassLoader) diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index b977711e7d5ad..c5195c1143a8f 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -35,7 +35,6 @@ import org.roaringbitmap.{ArrayContainer, BitmapContainer, RoaringArray, Roaring import org.apache.spark._ import org.apache.spark.api.python.PythonBroadcast import org.apache.spark.broadcast.HttpBroadcast -import org.apache.spark.network.nio.{GetBlock, GotBlock, PutBlock} import org.apache.spark.network.util.ByteUnit import org.apache.spark.scheduler.{CompressedMapStatus, HighlyCompressedMapStatus} import org.apache.spark.storage._ @@ -362,9 +361,6 @@ private[serializer] object KryoSerializer { private val toRegister: Seq[Class[_]] = Seq( ByteBuffer.allocate(1).getClass, classOf[StorageLevel], - 
classOf[PutBlock], - classOf[GotBlock], - classOf[GetBlock], classOf[CompressedMapStatus], classOf[HighlyCompressedMapStatus], classOf[RoaringBitmap], diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala similarity index 94% rename from core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala rename to core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala index 0c8f08f0f3b1b..7c3e2b5a3703b 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala @@ -15,16 +15,19 @@ * limitations under the License. */ -package org.apache.spark.shuffle.hash +package org.apache.spark.shuffle import org.apache.spark._ import org.apache.spark.serializer.Serializer -import org.apache.spark.shuffle.{BaseShuffleHandle, ShuffleReader} import org.apache.spark.storage.{BlockManager, ShuffleBlockFetcherIterator} import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter -private[spark] class HashShuffleReader[K, C]( +/** + * Fetches and reads the partitions in range [startPartition, endPartition) from a shuffle by + * requesting them from other nodes' block stores. + */ +private[spark] class BlockStoreShuffleReader[K, C]( handle: BaseShuffleHandle[K, _, C], startPartition: Int, endPartition: Int, @@ -33,9 +36,6 @@ private[spark] class HashShuffleReader[K, C]( mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker) extends ShuffleReader[K, C] with Logging { - require(endPartition == startPartition + 1, - "Hash shuffle currently only supports fetching one partition") - private val dep = handle.dependency /** Read the combined key-values for this reduce task */ @@ -44,7 +44,7 @@ private[spark] class HashShuffleReader[K, C]( context, blockManager.shuffleClient, blockManager, - mapOutputTracker.getMapSizesByExecutorId(handle.shuffleId, startPartition), + mapOutputTracker.getMapSizesByExecutorId(handle.shuffleId, startPartition, endPartition), // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024) diff --git a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala index c057de9b3f4df..cd253a78c2b19 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala @@ -17,9 +17,7 @@ package org.apache.spark.shuffle -import java.io.File import java.util.concurrent.ConcurrentLinkedQueue -import java.util.concurrent.atomic.AtomicInteger import scala.collection.JavaConverters._ @@ -28,10 +26,8 @@ import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.serializer.Serializer -import org.apache.spark.shuffle.FileShuffleBlockResolver.ShuffleFileGroup import org.apache.spark.storage._ import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap} -import org.apache.spark.util.collection.{PrimitiveKeyOpenHashMap, PrimitiveVector} /** A group of writers for a ShuffleMapTask, one writer per reducer. 
*/ private[spark] trait ShuffleWriterGroup { @@ -43,24 +39,7 @@ private[spark] trait ShuffleWriterGroup { /** * Manages assigning disk-based block writers to shuffle tasks. Each shuffle task gets one file - * per reducer (this set of files is called a ShuffleFileGroup). - * - * As an optimization to reduce the number of physical shuffle files produced, multiple shuffle - * blocks are aggregated into the same file. There is one "combined shuffle file" per reducer - * per concurrently executing shuffle task. As soon as a task finishes writing to its shuffle - * files, it releases them for another task. - * Regarding the implementation of this feature, shuffle files are identified by a 3-tuple: - * - shuffleId: The unique id given to the entire shuffle stage. - * - bucketId: The id of the output partition (i.e., reducer id) - * - fileId: The unique id identifying a group of "combined shuffle files." Only one task at a - * time owns a particular fileId, and this id is returned to a pool when the task finishes. - * Each shuffle file is then mapped to a FileSegment, which is a 3-tuple (file, offset, length) - * that specifies where in a given file the actual block data is located. - * - * Shuffle file metadata is stored in a space-efficient manner. Rather than simply mapping - * ShuffleBlockIds directly to FileSegments, each ShuffleFileGroup maintains a list of offsets for - * each block stored in each file. In order to find the location of a shuffle block, we search the - * files within a ShuffleFileGroups associated with the block's reducer. + * per reducer. */ // Note: Changes to the format in this file should be kept in sync with // org.apache.spark.network.shuffle.ExternalShuffleBlockResolver#getHashBasedShuffleBlockData(). @@ -71,26 +50,15 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) private lazy val blockManager = SparkEnv.get.blockManager - // Turning off shuffle file consolidation causes all shuffle Blocks to get their own file. - // TODO: Remove this once the shuffle file consolidation feature is stable. - private val consolidateShuffleFiles = - conf.getBoolean("spark.shuffle.consolidateFiles", false) - // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided private val bufferSize = conf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024 /** - * Contains all the state related to a particular shuffle. This includes a pool of unused - * ShuffleFileGroups, as well as all ShuffleFileGroups that have been created for the shuffle. + * Contains all the state related to a particular shuffle. */ - private class ShuffleState(val numBuckets: Int) { - val nextFileId = new AtomicInteger(0) - val unusedFileGroups = new ConcurrentLinkedQueue[ShuffleFileGroup]() - val allFileGroups = new ConcurrentLinkedQueue[ShuffleFileGroup]() - + private class ShuffleState(val numReducers: Int) { /** * The mapIds of all map tasks completed on this Executor for this shuffle. - * NB: This is only populated if consolidateShuffleFiles is FALSE. We don't need it otherwise. 
*/ val completedMapTasks = new ConcurrentLinkedQueue[Int]() } @@ -104,24 +72,16 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) * Get a ShuffleWriterGroup for the given map task, which will register it as complete * when the writers are closed successfully */ - def forMapTask(shuffleId: Int, mapId: Int, numBuckets: Int, serializer: Serializer, + def forMapTask(shuffleId: Int, mapId: Int, numReducers: Int, serializer: Serializer, writeMetrics: ShuffleWriteMetrics): ShuffleWriterGroup = { new ShuffleWriterGroup { - shuffleStates.putIfAbsent(shuffleId, new ShuffleState(numBuckets)) + shuffleStates.putIfAbsent(shuffleId, new ShuffleState(numReducers)) private val shuffleState = shuffleStates(shuffleId) - private var fileGroup: ShuffleFileGroup = null val openStartTime = System.nanoTime val serializerInstance = serializer.newInstance() - val writers: Array[DiskBlockObjectWriter] = if (consolidateShuffleFiles) { - fileGroup = getUnusedFileGroup() - Array.tabulate[DiskBlockObjectWriter](numBuckets) { bucketId => - val blockId = ShuffleBlockId(shuffleId, mapId, bucketId) - blockManager.getDiskWriter(blockId, fileGroup(bucketId), serializerInstance, bufferSize, - writeMetrics) - } - } else { - Array.tabulate[DiskBlockObjectWriter](numBuckets) { bucketId => + val writers: Array[DiskBlockObjectWriter] = { + Array.tabulate[DiskBlockObjectWriter](numReducers) { bucketId => val blockId = ShuffleBlockId(shuffleId, mapId, bucketId) val blockFile = blockManager.diskBlockManager.getFile(blockId) // Because of previous failures, the shuffle file may already exist on this machine. @@ -142,58 +102,14 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) writeMetrics.incShuffleWriteTime(System.nanoTime - openStartTime) override def releaseWriters(success: Boolean) { - if (consolidateShuffleFiles) { - if (success) { - val offsets = writers.map(_.fileSegment().offset) - val lengths = writers.map(_.fileSegment().length) - fileGroup.recordMapOutput(mapId, offsets, lengths) - } - recycleFileGroup(fileGroup) - } else { - shuffleState.completedMapTasks.add(mapId) - } - } - - private def getUnusedFileGroup(): ShuffleFileGroup = { - val fileGroup = shuffleState.unusedFileGroups.poll() - if (fileGroup != null) fileGroup else newFileGroup() - } - - private def newFileGroup(): ShuffleFileGroup = { - val fileId = shuffleState.nextFileId.getAndIncrement() - val files = Array.tabulate[File](numBuckets) { bucketId => - val filename = physicalFileName(shuffleId, bucketId, fileId) - blockManager.diskBlockManager.getFile(filename) - } - val fileGroup = new ShuffleFileGroup(shuffleId, fileId, files) - shuffleState.allFileGroups.add(fileGroup) - fileGroup - } - - private def recycleFileGroup(group: ShuffleFileGroup) { - shuffleState.unusedFileGroups.add(group) + shuffleState.completedMapTasks.add(mapId) } } } override def getBlockData(blockId: ShuffleBlockId): ManagedBuffer = { - if (consolidateShuffleFiles) { - // Search all file groups associated with this shuffle. 
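With consolidation removed, a shuffle block's bytes are simply the whole contents of its own file, keyed by (shuffleId, mapId, reduceId). A small hedged sketch of that lookup, with a hypothetical getFile helper standing in for DiskBlockManager and a plain tuple standing in for FileSegmentManagedBuffer:

```scala
import java.io.File

object ShuffleBlockLookupSketch {
  // Hypothetical stand-in for DiskBlockManager.getFile: one file per shuffle block.
  def getFile(root: File, shuffleId: Int, mapId: Int, reduceId: Int): File =
    new File(root, s"shuffle_${shuffleId}_${mapId}_${reduceId}")

  // The whole file is the block, so the "segment" is simply (file, offset 0, file length).
  def blockSegment(root: File, shuffleId: Int, mapId: Int, reduceId: Int): (File, Long, Long) = {
    val file = getFile(root, shuffleId, mapId, reduceId)
    (file, 0L, file.length)
  }
}
```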
- val shuffleState = shuffleStates(blockId.shuffleId) - val iter = shuffleState.allFileGroups.iterator - while (iter.hasNext) { - val segmentOpt = iter.next.getFileSegmentFor(blockId.mapId, blockId.reduceId) - if (segmentOpt.isDefined) { - val segment = segmentOpt.get - return new FileSegmentManagedBuffer( - transportConf, segment.file, segment.offset, segment.length) - } - } - throw new IllegalStateException("Failed to find shuffle block: " + blockId) - } else { - val file = blockManager.diskBlockManager.getFile(blockId) - new FileSegmentManagedBuffer(transportConf, file, 0, file.length) - } + val file = blockManager.diskBlockManager.getFile(blockId) + new FileSegmentManagedBuffer(transportConf, file, 0, file.length) } /** Remove all the blocks / files and metadata related to a particular shuffle. */ @@ -209,16 +125,11 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) private def removeShuffleBlocks(shuffleId: ShuffleId): Boolean = { shuffleStates.get(shuffleId) match { case Some(state) => - if (consolidateShuffleFiles) { - for (fileGroup <- state.allFileGroups.asScala; - file <- fileGroup.files) { - file.delete() - } - } else { - for (mapId <- state.completedMapTasks.asScala; - reduceId <- 0 until state.numBuckets) { - val blockId = new ShuffleBlockId(shuffleId, mapId, reduceId) - blockManager.diskBlockManager.getFile(blockId).delete() + for (mapId <- state.completedMapTasks.asScala; reduceId <- 0 until state.numReducers) { + val blockId = new ShuffleBlockId(shuffleId, mapId, reduceId) + val file = blockManager.diskBlockManager.getFile(blockId) + if (!file.delete()) { + logWarning(s"Error deleting ${file.getPath()}") } } logInfo("Deleted all files for shuffle " + shuffleId) @@ -229,10 +140,6 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) } } - private def physicalFileName(shuffleId: Int, bucketId: Int, fileId: Int) = { - "merged_shuffle_%d_%d_%d".format(shuffleId, bucketId, fileId) - } - private def cleanup(cleanupTime: Long) { shuffleStates.clearOldValues(cleanupTime, (shuffleId, state) => removeShuffleBlocks(shuffleId)) } @@ -241,59 +148,3 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf) metadataCleaner.cancel() } } - -private[spark] object FileShuffleBlockResolver { - /** - * A group of shuffle files, one per reducer. - * A particular mapper will be assigned a single ShuffleFileGroup to write its output to. - */ - private class ShuffleFileGroup(val shuffleId: Int, val fileId: Int, val files: Array[File]) { - private var numBlocks: Int = 0 - - /** - * Stores the absolute index of each mapId in the files of this group. For instance, - * if mapId 5 is the first block in each file, mapIdToIndex(5) = 0. - */ - private val mapIdToIndex = new PrimitiveKeyOpenHashMap[Int, Int]() - - /** - * Stores consecutive offsets and lengths of blocks into each reducer file, ordered by - * position in the file. - * Note: mapIdToIndex(mapId) returns the index of the mapper into the vector for every - * reducer. 
- */ - private val blockOffsetsByReducer = Array.fill[PrimitiveVector[Long]](files.length) { - new PrimitiveVector[Long]() - } - private val blockLengthsByReducer = Array.fill[PrimitiveVector[Long]](files.length) { - new PrimitiveVector[Long]() - } - - def apply(bucketId: Int): File = files(bucketId) - - def recordMapOutput(mapId: Int, offsets: Array[Long], lengths: Array[Long]) { - assert(offsets.length == lengths.length) - mapIdToIndex(mapId) = numBlocks - numBlocks += 1 - for (i <- 0 until offsets.length) { - blockOffsetsByReducer(i) += offsets(i) - blockLengthsByReducer(i) += lengths(i) - } - } - - /** Returns the FileSegment associated with the given map task, or None if no entry exists. */ - def getFileSegmentFor(mapId: Int, reducerId: Int): Option[FileSegment] = { - val file = files(reducerId) - val blockOffsets = blockOffsetsByReducer(reducerId) - val blockLengths = blockLengthsByReducer(reducerId) - val index = mapIdToIndex.getOrElse(mapId, -1) - if (index >= 0) { - val offset = blockOffsets(index) - val length = blockLengths(index) - Some(new FileSegment(file, offset, length)) - } else { - None - } - } - } -} diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index d0163d326dba7..5e4c2b5d0a5c4 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -21,7 +21,7 @@ import java.io._ import com.google.common.io.ByteStreams -import org.apache.spark.{SparkConf, SparkEnv} +import org.apache.spark.{SparkConf, SparkEnv, Logging} import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.storage._ @@ -40,7 +40,8 @@ import IndexShuffleBlockResolver.NOOP_REDUCE_ID */ // Note: Changes to the format in this file should be kept in sync with // org.apache.spark.network.shuffle.ExternalShuffleBlockResolver#getSortBasedShuffleBlockData(). -private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleBlockResolver { +private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleBlockResolver + with Logging { private lazy val blockManager = SparkEnv.get.blockManager @@ -60,12 +61,16 @@ private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleB def removeDataByMap(shuffleId: Int, mapId: Int): Unit = { var file = getDataFile(shuffleId, mapId) if (file.exists()) { - file.delete() + if (!file.delete()) { + logWarning(s"Error deleting data ${file.getPath()}") + } } file = getIndexFile(shuffleId, mapId) if (file.exists()) { - file.delete() + if (!file.delete()) { + logWarning(s"Error deleting index ${file.getPath()}") + } } } @@ -114,9 +119,8 @@ private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleB } private[spark] object IndexShuffleBlockResolver { - // No-op reduce ID used in interactions with disk store and DiskBlockObjectWriter. + // No-op reduce ID used in interactions with disk store. // The disk store currently expects puts to relate to a (map, reduce) pair, but in the sort // shuffle outputs for several reduces are glommed into a single file. - // TODO: Avoid this entirely by having the DiskBlockObjectWriter not require a BlockId. 
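The resolver changes above also stop ignoring the return value of File.delete(): a failed delete is now logged rather than silently swallowed. A small standalone helper capturing that pattern, with stderr standing in for Spark's logWarning:

    import java.io.File

    object SafeDelete {
      // Returns true if the file was deleted; logs a warning otherwise.
      def deleteWithWarning(file: File): Boolean = {
        val deleted = file.delete()
        if (!deleted) {
          System.err.println(s"Error deleting ${file.getPath}")
        }
        deleted
      }

      def main(args: Array[String]): Unit = {
        val tmp = File.createTempFile("shuffle", ".data")
        println(s"deleted=${deleteWithWarning(tmp)}")
      }
    }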
val NOOP_REDUCE_ID = 0 } diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala index c089088f409dd..d2e2fc4c110a7 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleManager.scala @@ -24,7 +24,13 @@ import org.apache.spark.shuffle._ * A ShuffleManager using hashing, that creates one output file per reduce partition on each * mapper (possibly reusing these across waves of tasks). */ -private[spark] class HashShuffleManager(conf: SparkConf) extends ShuffleManager { +private[spark] class HashShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { + + if (!conf.getBoolean("spark.shuffle.spill", true)) { + logWarning( + "spark.shuffle.spill was set to false, but this configuration is ignored as of Spark 1.6+." + + " Shuffle will continue to spill to disk when necessary.") + } private val fileShuffleBlockResolver = new FileShuffleBlockResolver(conf) @@ -45,7 +51,7 @@ private[spark] class HashShuffleManager(conf: SparkConf) extends ShuffleManager startPartition: Int, endPartition: Int, context: TaskContext): ShuffleReader[K, C] = { - new HashShuffleReader( + new BlockStoreShuffleReader( handle.asInstanceOf[BaseShuffleHandle[K, _, C]], startPartition, endPartition, context) } diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala index d7fab351ca3b8..9df4e551669cc 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala @@ -19,11 +19,16 @@ package org.apache.spark.shuffle.sort import java.util.concurrent.ConcurrentHashMap -import org.apache.spark.{SparkConf, TaskContext, ShuffleDependency} +import org.apache.spark.{Logging, SparkConf, TaskContext, ShuffleDependency} import org.apache.spark.shuffle._ -import org.apache.spark.shuffle.hash.HashShuffleReader -private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager { +private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { + + if (!conf.getBoolean("spark.shuffle.spill", true)) { + logWarning( + "spark.shuffle.spill was set to false, but this configuration is ignored as of Spark 1.6+." + + " Shuffle will continue to spill to disk when necessary.") + } private val indexShuffleBlockResolver = new IndexShuffleBlockResolver(conf) private val shuffleMapNumber = new ConcurrentHashMap[Int, Int]() @@ -48,7 +53,7 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager endPartition: Int, context: TaskContext): ShuffleReader[K, C] = { // We currently use the same block store shuffle fetcher as the hash-based shuffle. 
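Both shuffle managers above now emit the same construction-time warning when spark.shuffle.spill is explicitly set to false, since the flag is ignored from Spark 1.6 on. A standalone sketch of that check, using a plain Map in place of SparkConf:

    object SpillConfigCheck {
      // Warn once, at construction time, if a now-ignored flag has been disabled.
      def warnIfSpillDisabled(conf: Map[String, String]): Unit = {
        val spillEnabled = conf.get("spark.shuffle.spill").forall(_.toBoolean) // defaults to true
        if (!spillEnabled) {
          System.err.println(
            "spark.shuffle.spill was set to false, but this configuration is ignored as of Spark 1.6+." +
              " Shuffle will continue to spill to disk when necessary.")
        }
      }

      def main(args: Array[String]): Unit =
        warnIfSpillDisabled(Map("spark.shuffle.spill" -> "false"))
    }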
- new HashShuffleReader( + new BlockStoreShuffleReader( handle.asInstanceOf[BaseShuffleHandle[K, _, C]], startPartition, endPartition, context) } diff --git a/core/src/main/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManager.scala index df7bbd64247dd..75f22f642b9d1 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleManager.scala @@ -159,7 +159,7 @@ private[spark] class UnsafeShuffleManager(conf: SparkConf) extends ShuffleManage mapId: Int, context: TaskContext): ShuffleWriter[K, V] = { handle match { - case unsafeShuffleHandle: UnsafeShuffleHandle[K, V] => + case unsafeShuffleHandle: UnsafeShuffleHandle[K @unchecked, V @unchecked] => numMapsForShufflesThatUsedNewPath.putIfAbsent(handle.shuffleId, unsafeShuffleHandle.numMaps) val env = SparkEnv.get new UnsafeShuffleWriter( diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala index 390c136df79b3..24a0b5220695c 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala @@ -127,7 +127,7 @@ private[v1] object AllStagesResource { new TaskData( taskId = uiData.taskInfo.taskId, index = uiData.taskInfo.index, - attempt = uiData.taskInfo.attempt, + attempt = uiData.taskInfo.attemptNumber, launchTime = new Date(uiData.taskInfo.launchTime), executorId = uiData.taskInfo.executorId, host = uiData.taskInfo.host, diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManagerId.scala b/core/src/main/scala/org/apache/spark/storage/BlockFetchException.scala similarity index 60% rename from core/src/main/scala/org/apache/spark/network/nio/ConnectionManagerId.scala rename to core/src/main/scala/org/apache/spark/storage/BlockFetchException.scala index 1cd13d887c6f6..f6e46ae9a481a 100644 --- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManagerId.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockFetchException.scala @@ -15,23 +15,10 @@ * limitations under the License. 
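The UnsafeShuffleManager hunk above adds @unchecked to the type arguments of the handle pattern match: type parameters are erased at runtime, so the annotation tells the compiler not to warn about a check it cannot actually perform. A minimal standalone illustration of the same idiom (the names here are invented for the example):

    object UncheckedMatch {
      sealed trait Handle[K, V]
      final case class FastHandle[K, V](numMaps: Int) extends Handle[K, V]
      final case class SlowHandle[K, V]() extends Handle[K, V]

      def describe[K, V](h: Handle[K, V]): String = h match {
        // Only the runtime class matters here; @unchecked suppresses the erasure warning.
        case f: FastHandle[K @unchecked, V @unchecked] => s"fast with ${f.numMaps} maps"
        case _: SlowHandle[_, _]                       => "slow"
      }

      def main(args: Array[String]): Unit =
        println(describe(FastHandle[Int, String](4))) // prints: fast with 4 maps
    }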
*/ -package org.apache.spark.network.nio +package org.apache.spark.storage -import java.net.InetSocketAddress +import org.apache.spark.SparkException -import org.apache.spark.util.Utils - -private[nio] case class ConnectionManagerId(host: String, port: Int) { - // DEBUG code - Utils.checkHost(host) - assert (port > 0) - - def toSocketAddress(): InetSocketAddress = new InetSocketAddress(host, port) -} - - -private[nio] object ConnectionManagerId { - def fromSocketAddress(socketAddress: InetSocketAddress): ConnectionManagerId = { - new ConnectionManagerId(socketAddress.getHostName, socketAddress.getPort) - } -} +private[spark] +case class BlockFetchException(messages: String, throwable: Throwable) + extends SparkException(messages, throwable) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index fefaef0ab82c8..47bd2ef8b2941 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -23,6 +23,7 @@ import java.nio.{ByteBuffer, MappedByteBuffer} import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.concurrent.{ExecutionContext, Await, Future} import scala.concurrent.duration._ +import scala.util.control.NonFatal import scala.util.Random import sun.nio.ch.DirectBuffer @@ -105,15 +106,6 @@ private[spark] class BlockManager( } } - // Check that we're not using external shuffle service with consolidated shuffle files. - if (externalShuffleServiceEnabled - && conf.getBoolean("spark.shuffle.consolidateFiles", false) - && shuffleManager.isInstanceOf[HashShuffleManager]) { - throw new UnsupportedOperationException("Cannot use external shuffle service with consolidated" - + " shuffle files in hash-based shuffle. Please disable spark.shuffle.consolidateFiles or " - + " switch to sort-based shuffle.") - } - var blockManagerId: BlockManagerId = _ // Address of the server that serves this executor's shuffle files. This is either an external @@ -600,10 +592,26 @@ private[spark] class BlockManager( private def doGetRemote(blockId: BlockId, asBlockResult: Boolean): Option[Any] = { require(blockId != null, "BlockId is null") val locations = Random.shuffle(master.getLocations(blockId)) + var numFetchFailures = 0 for (loc <- locations) { logDebug(s"Getting remote block $blockId from $loc") - val data = blockTransferService.fetchBlockSync( - loc.host, loc.port, loc.executorId, blockId.toString).nioByteBuffer() + val data = try { + blockTransferService.fetchBlockSync( + loc.host, loc.port, loc.executorId, blockId.toString).nioByteBuffer() + } catch { + case NonFatal(e) => + numFetchFailures += 1 + if (numFetchFailures == locations.size) { + // An exception is thrown while fetching this block from all locations + throw new BlockFetchException(s"Failed to fetch block from" + + s" ${locations.size} locations. 
Most recent failure cause:", e) + } else { + // This location failed, so we retry fetch from a different one by returning null here + logWarning(s"Failed to fetch remote block $blockId " + + s"from $loc (failed attempt $numFetchFailures)", e) + null + } + } if (data != null) { if (asBlockResult) { @@ -661,7 +669,7 @@ private[spark] class BlockManager( writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = { val compressStream: OutputStream => OutputStream = wrapForCompression(blockId, _) val syncWrites = conf.getBoolean("spark.shuffle.sync", false) - new DiskBlockObjectWriter(blockId, file, serializerInstance, bufferSize, compressStream, + new DiskBlockObjectWriter(file, serializerInstance, bufferSize, compressStream, syncWrites, writeMetrics) } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index 3f8d26e1d4cab..f7e84a2c2e14c 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -164,7 +164,9 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon private def doStop(): Unit = { // Only perform cleanup if an external service is not serving our shuffle files. - if (!blockManager.externalShuffleServiceEnabled || blockManager.blockManagerId.isDriver) { + // Also blockManagerId could be null if block manager is not initialized properly. + if (!blockManager.externalShuffleServiceEnabled || + (blockManager.blockManagerId != null && blockManager.blockManagerId.isDriver)) { localDirs.foreach { localDir => if (localDir.isDirectory() && localDir.exists()) { try { diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala index 49d9154f95a5b..80d426fadc65e 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala @@ -34,7 +34,6 @@ import org.apache.spark.util.Utils * reopened again. */ private[spark] class DiskBlockObjectWriter( - val blockId: BlockId, file: File, serializerInstance: SerializerInstance, bufferSize: Int, @@ -144,8 +143,10 @@ private[spark] class DiskBlockObjectWriter( * Reverts writes that haven't been flushed yet. Callers should invoke this function * when there are runtime exceptions. This method will not throw, though it may be * unsuccessful in truncating written data. + * + * @return the file that this DiskBlockObjectWriter wrote to. */ - def revertPartialWritesAndClose() { + def revertPartialWritesAndClose(): File = { // Discard current writes. We do this by flushing the outstanding writes and then // truncating the file to its initial position. 
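The doGetRemote change above retries the fetch against every known location, logs each intermediate failure, and only raises an exception once all locations have failed. A standalone sketch of that failover pattern with a generic fetch function; the exception class here is a hypothetical stand-in for BlockFetchException:

    import scala.util.control.NonFatal

    object FetchWithFailover {
      final class AllLocationsFailedException(msg: String, cause: Throwable)
        extends RuntimeException(msg, cause)

      // Try each location in turn; return the first successful result,
      // and throw only after every location has failed.
      def fetchFirstAvailable[L, T](locations: Seq[L])(fetch: L => T): T = {
        var numFetchFailures = 0
        val it = locations.iterator
        while (it.hasNext) {
          val loc = it.next()
          try {
            return fetch(loc)
          } catch {
            case NonFatal(e) =>
              numFetchFailures += 1
              if (numFetchFailures == locations.size) {
                throw new AllLocationsFailedException(
                  s"Failed to fetch block from ${locations.size} locations. Most recent failure cause:", e)
              }
              System.err.println(s"Failed to fetch from $loc (failed attempt $numFetchFailures): ${e.getMessage}")
          }
        }
        throw new IllegalStateException("No locations to fetch from")
      }

      def main(args: Array[String]): Unit = {
        val data = fetchFirstAvailable(Seq("badHost", "goodHost")) { loc =>
          if (loc == "badHost") throw new RuntimeException("connection refused") else s"bytes from $loc"
        }
        println(data) // bytes from goodHost
      }
    }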
try { @@ -160,12 +161,14 @@ private[spark] class DiskBlockObjectWriter( val truncateStream = new FileOutputStream(file, true) try { truncateStream.getChannel.truncate(initialPosition) + file } finally { truncateStream.close() } } catch { case e: Exception => logError("Uncaught exception while reverting partial writes to file " + file, e) + file } } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala index 1f45956282166..c008b9dc16327 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala @@ -86,7 +86,9 @@ private[spark] class DiskStore(blockManager: BlockManager, diskManager: DiskBloc } catch { case e: Throwable => if (file.exists()) { - file.delete() + if (!file.delete()) { + logWarning(s"Error deleting ${file}") + } } throw e } @@ -154,11 +156,12 @@ private[spark] class DiskStore(blockManager: BlockManager, diskManager: DiskBloc override def remove(blockId: BlockId): Boolean = { val file = diskManager.getFile(blockId.name) - // If consolidation mode is used With HashShuffleMananger, the physical filename for the block - // is different from blockId.name. So the file returns here will not be exist, thus we avoid to - // delete the whole consolidated file by mistake. if (file.exists()) { - file.delete() + val ret = file.delete() + if (!ret) { + logWarning(s"Error deleting ${file.getPath()}") + } + ret } else { false } diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index a759ceb96ec1e..0d0448feb5b06 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -260,10 +260,7 @@ final class ShuffleBlockFetcherIterator( fetchRequests ++= Utils.randomize(remoteRequests) // Send out initial requests for blocks, up to our maxBytesInFlight - while (fetchRequests.nonEmpty && - (bytesInFlight == 0 || bytesInFlight + fetchRequests.front.size <= maxBytesInFlight)) { - sendRequest(fetchRequests.dequeue()) - } + fetchUpToMaxBytes() val numFetches = remoteRequests.size - fetchRequests.size logInfo("Started " + numFetches + " remote fetches in" + Utils.getUsedTimeMs(startTime)) @@ -296,10 +293,7 @@ final class ShuffleBlockFetcherIterator( case _ => } // Send fetch requests up to maxBytesInFlight - while (fetchRequests.nonEmpty && - (bytesInFlight == 0 || bytesInFlight + fetchRequests.front.size <= maxBytesInFlight)) { - sendRequest(fetchRequests.dequeue()) - } + fetchUpToMaxBytes() result match { case FailureFetchResult(blockId, address, e) => @@ -315,6 +309,14 @@ final class ShuffleBlockFetcherIterator( } } + private def fetchUpToMaxBytes(): Unit = { + // Send fetch requests up to maxBytesInFlight + while (fetchRequests.nonEmpty && + (bytesInFlight == 0 || bytesInFlight + fetchRequests.front.size <= maxBytesInFlight)) { + sendRequest(fetchRequests.dequeue()) + } + } + private def throwFetchFailedException(blockId: BlockId, address: BlockManagerId, e: Throwable) = { blockId match { case ShuffleBlockId(shufId, mapId, reduceId) => diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 779c0ba083596..b796a44fe01ac 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ 
b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -59,7 +59,17 @@ private[spark] object JettyUtils extends Logging { def createServlet[T <% AnyRef]( servletParams: ServletParams[T], - securityMgr: SecurityManager): HttpServlet = { + securityMgr: SecurityManager, + conf: SparkConf): HttpServlet = { + + // SPARK-10589 avoid frame-related click-jacking vulnerability, using X-Frame-Options + // (see http://tools.ietf.org/html/rfc7034). By default allow framing only from the + // same origin, but allow framing for a specific named URI. + // Example: spark.ui.allowFramingFrom = https://example.com/ + val allowFramingFrom = conf.getOption("spark.ui.allowFramingFrom") + val xFrameOptionsValue = + allowFramingFrom.map(uri => s"ALLOW-FROM $uri").getOrElse("SAMEORIGIN") + new HttpServlet { override def doGet(request: HttpServletRequest, response: HttpServletResponse) { try { @@ -68,6 +78,7 @@ private[spark] object JettyUtils extends Logging { response.setStatus(HttpServletResponse.SC_OK) val result = servletParams.responder(request) response.setHeader("Cache-Control", "no-cache, no-store, must-revalidate") + response.setHeader("X-Frame-Options", xFrameOptionsValue) // scalastyle:off println response.getWriter.println(servletParams.extractFn(result)) // scalastyle:on println @@ -97,8 +108,9 @@ private[spark] object JettyUtils extends Logging { path: String, servletParams: ServletParams[T], securityMgr: SecurityManager, + conf: SparkConf, basePath: String = ""): ServletContextHandler = { - createServletHandler(path, createServlet(servletParams, securityMgr), basePath) + createServletHandler(path, createServlet(servletParams, securityMgr, conf), basePath) } /** Create a context handler that responds to a request with the given path prefix */ diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index d8b90568b7b9a..99085ada9f0af 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -56,6 +56,8 @@ private[spark] class SparkUI private ( val stagesTab = new StagesTab(this) + var appId: String = _ + /** Initialize all components of the server. */ def initialize() { attachTab(new JobsTab(this)) @@ -75,9 +77,8 @@ private[spark] class SparkUI private ( def getAppName: String = appName - /** Set the app name for this UI. */ - def setAppName(name: String) { - appName = name + def setAppId(id: String): Unit = { + appId = id } /** Stop the server behind this web interface. Only valid after bind(). 
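The JettyUtils change above (SPARK-10589) adds an X-Frame-Options header to every UI response, defaulting to SAMEORIGIN unless spark.ui.allowFramingFrom names a permitted origin. A standalone sketch of how the header value is derived from the optional setting:

    object FrameOptions {
      // SAMEORIGIN by default; ALLOW-FROM <uri> when an origin is explicitly allowed.
      def xFrameOptionsValue(allowFramingFrom: Option[String]): String =
        allowFramingFrom.map(uri => s"ALLOW-FROM $uri").getOrElse("SAMEORIGIN")

      def main(args: Array[String]): Unit = {
        println(xFrameOptionsValue(None))                         // SAMEORIGIN
        println(xFrameOptionsValue(Some("https://example.com/"))) // ALLOW-FROM https://example.com/
      }
    }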
*/ @@ -94,12 +95,12 @@ private[spark] class SparkUI private ( private[spark] def appUIAddress = s"http://$appUIHostPort" def getSparkUI(appId: String): Option[SparkUI] = { - if (appId == appName) Some(this) else None + if (appId == this.appId) Some(this) else None } def getApplicationInfoList: Iterator[ApplicationInfo] = { Iterator(new ApplicationInfo( - id = appName, + id = appId, name = appName, attempts = Seq(new ApplicationAttemptInfo( attemptId = None, diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index f2da417724104..21dc8f0b65485 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -18,9 +18,11 @@ package org.apache.spark.ui import java.text.SimpleDateFormat -import java.util.{Locale, Date} +import java.util.{Date, Locale} -import scala.xml.{Node, Text, Unparsed} +import scala.util.control.NonFatal +import scala.xml._ +import scala.xml.transform.{RewriteRule, RuleTransformer} import org.apache.spark.Logging import org.apache.spark.ui.scope.RDDOperationGraph @@ -395,4 +397,60 @@ private[spark] object UIUtils extends Logging { } + /** + * Returns HTML rendering of a job or stage description. It will try to parse the string as HTML + * and make sure that it only contains anchors with root-relative links. Otherwise, + * the whole string will rendered as a simple escaped text. + * + * Note: In terms of security, only anchor tags with root relative links are supported. So any + * attempts to embed links outside Spark UI, or other tags like +
+ } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 435e6319a64c4..2467b4e48415b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -294,6 +294,33 @@ object functions { */ def min(columnName: String): Column = min(Column(columnName)) + /** + * Aggregate function: returns the unbiased sample standard deviation + * of the expression in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev(e: Column): Column = Stddev(e.expr) + + /** + * Aggregate function: returns the population standard deviation of + * the expression in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev_pop(e: Column): Column = StddevPop(e.expr) + + /** + * Aggregate function: returns the unbiased sample standard deviation of + * the expression in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev_samp(e: Column): Column = StddevSamp(e.expr) + /** * Aggregate function: returns the sum of all values in the expression. * @@ -796,7 +823,7 @@ object functions { * * @group normal_funcs */ - def expr(expr: String): Column = Column(new SqlParser().parseExpression(expr)) + def expr(expr: String): Column = Column(SqlParser.parseExpression(expr)) ////////////////////////////////////////////////////////////////////////////////////////////// // Math Functions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 8849fc2f1f0ef..c70fea1c3f50e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -88,6 +88,17 @@ abstract class JdbcDialect { def quoteIdentifier(colName: String): String = { s""""$colName"""" } + + /** + * Get the SQL query that should be used to find if the given table exists. Dialects can + * override this method to return a query that works best in a particular database. + * @param table The name of the table. + * @return The SQL query to use for checking the table. + */ + def getTableExistsQuery(table: String): String = { + s"SELECT * FROM $table WHERE 1=0" + } + } /** @@ -125,6 +136,9 @@ object JdbcDialects { registerDialect(MySQLDialect) registerDialect(PostgresDialect) + registerDialect(DB2Dialect) + registerDialect(MsSqlServerDialect) + /** * Fetch the JdbcDialect class corresponding to a given database url. @@ -197,6 +211,11 @@ case object PostgresDialect extends JdbcDialect { case BooleanType => Some(JdbcType("BOOLEAN", java.sql.Types.BOOLEAN)) case _ => None } + + override def getTableExistsQuery(table: String): String = { + s"SELECT 1 FROM $table LIMIT 1" + } + } /** @@ -221,4 +240,41 @@ case object MySQLDialect extends JdbcDialect { override def quoteIdentifier(colName: String): String = { s"`$colName`" } + + override def getTableExistsQuery(table: String): String = { + s"SELECT 1 FROM $table LIMIT 1" + } +} + +/** + * :: DeveloperApi :: + * Default DB2 dialect, mapping string/boolean on write to valid DB2 types. + * By default string, and boolean gets mapped to db2 invalid types TEXT, and BIT(1). 
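The getTableExistsQuery hook added above lets each dialect choose its table-existence probe: the default issues SELECT * ... WHERE 1=0, while the Postgres and MySQL dialects override it with the cheaper SELECT 1 ... LIMIT 1. A simplified standalone model of that override pattern, not Spark's JdbcDialect class itself:

    object TableExistsQueries {
      trait Dialect {
        def canHandle(url: String): Boolean
        // Default probe: returns no rows, but still fails if the table is missing.
        def getTableExistsQuery(table: String): String = s"SELECT * FROM $table WHERE 1=0"
      }

      object DefaultDialect extends Dialect {
        override def canHandle(url: String): Boolean = true
      }

      object PostgresLikeDialect extends Dialect {
        override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql")
        // Cheaper probe on databases that support LIMIT.
        override def getTableExistsQuery(table: String): String = s"SELECT 1 FROM $table LIMIT 1"
      }

      def dialectFor(url: String): Dialect =
        Seq(PostgresLikeDialect, DefaultDialect).find(_.canHandle(url)).get

      def main(args: Array[String]): Unit = {
        println(dialectFor("jdbc:postgresql://host/db").getTableExistsQuery("people"))
        println(dialectFor("jdbc:db2://host/db").getTableExistsQuery("people"))
      }
    }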
+ */ +@DeveloperApi +case object DB2Dialect extends JdbcDialect { + + override def canHandle(url: String): Boolean = url.startsWith("jdbc:db2") + + override def getJDBCType(dt: DataType): Option[JdbcType] = dt match { + case StringType => Some(JdbcType("CLOB", java.sql.Types.CLOB)) + case BooleanType => Some(JdbcType("CHAR(1)", java.sql.Types.CHAR)) + case _ => None + } +} + +/** + * :: DeveloperApi :: + * Default Microsoft SQL Server dialect, mapping the datetimeoffset types to a String on read. + */ +@DeveloperApi +case object MsSqlServerDialect extends JdbcDialect { + override def canHandle(url: String): Boolean = url.startsWith("jdbc:sqlserver") + override def getCatalystType( + sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { + if (typeName.contains("datetimeoffset")) { + // String is recommend by Microsoft SQL Server for datetimeoffset types in non-MS clients + Some(StringType) + } else None + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index 2fdd798b44bb6..963e6030c14c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -39,22 +39,20 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] { override def pyUDT: String = "pyspark.sql.tests.ExamplePointUDT" - override def serialize(obj: Any): Seq[Double] = { + override def serialize(obj: Any): GenericArrayData = { obj match { case p: ExamplePoint => - Seq(p.x, p.y) + val output = new Array[Any](2) + output(0) = p.x + output(1) = p.y + new GenericArrayData(output) } } override def deserialize(datum: Any): ExamplePoint = { datum match { - case values: Seq[_] => - val xy = values.asInstanceOf[Seq[Double]] - assert(xy.length == 2) - new ExamplePoint(xy(0), xy(1)) - case values: util.ArrayList[_] => - val xy = values.asInstanceOf[util.ArrayList[Double]].asScala - new ExamplePoint(xy(0), xy(1)) + case values: ArrayData => + new ExamplePoint(values.getDouble(0), values.getDouble(1)) } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java index bf693c7c393f6..7b50aad4ad498 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java @@ -18,6 +18,7 @@ package test.org.apache.spark.sql; import java.io.Serializable; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -83,7 +84,7 @@ public void setAge(int age) { @Test public void applySchema() { - List personList = new ArrayList(2); + List personList = new ArrayList<>(2); Person person1 = new Person(); person1.setName("Michael"); person1.setAge(29); @@ -95,12 +96,13 @@ public void applySchema() { JavaRDD rowRDD = javaCtx.parallelize(personList).map( new Function() { + @Override public Row call(Person person) throws Exception { return RowFactory.create(person.getName(), person.getAge()); } }); - List fields = new ArrayList(2); + List fields = new ArrayList<>(2); fields.add(DataTypes.createStructField("name", DataTypes.StringType, false)); fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false)); StructType schema = DataTypes.createStructType(fields); @@ -118,7 +120,7 @@ public Row call(Person person) throws Exception { 
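The two new dialects above adjust type mappings in both directions: DB2 swaps the generic TEXT/BIT(1) write types for CLOB/CHAR(1), and SQL Server reads datetimeoffset columns back as strings. A simplified standalone sketch of those mappings; the type names match the diff, but the surrounding enumeration is illustrative only:

    object DialectTypeMappings {
      sealed trait CatalystType
      case object StringType extends CatalystType
      case object BooleanType extends CatalystType
      case object IntegerType extends CatalystType

      // Write side (DB2): override types the target database would reject.
      def db2WriteType(dt: CatalystType): Option[String] = dt match {
        case StringType  => Some("CLOB")    // instead of TEXT
        case BooleanType => Some("CHAR(1)") // instead of BIT(1)
        case _           => None            // fall back to the common mapping
      }

      // Read side (SQL Server): surface datetimeoffset as a plain string.
      def mssqlReadType(jdbcTypeName: String): Option[CatalystType] =
        if (jdbcTypeName.contains("datetimeoffset")) Some(StringType) else None

      def main(args: Array[String]): Unit = {
        println(db2WriteType(BooleanType))       // Some(CHAR(1))
        println(mssqlReadType("datetimeoffset")) // Some(StringType)
      }
    }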
@Test public void dataFrameRDDOperations() { - List personList = new ArrayList(2); + List personList = new ArrayList<>(2); Person person1 = new Person(); person1.setName("Michael"); person1.setAge(29); @@ -129,27 +131,28 @@ public void dataFrameRDDOperations() { personList.add(person2); JavaRDD rowRDD = javaCtx.parallelize(personList).map( - new Function() { - public Row call(Person person) throws Exception { - return RowFactory.create(person.getName(), person.getAge()); - } - }); - - List fields = new ArrayList(2); - fields.add(DataTypes.createStructField("name", DataTypes.StringType, false)); + new Function() { + @Override + public Row call(Person person) { + return RowFactory.create(person.getName(), person.getAge()); + } + }); + + List fields = new ArrayList<>(2); + fields.add(DataTypes.createStructField("", DataTypes.StringType, false)); fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false)); StructType schema = DataTypes.createStructType(fields); DataFrame df = sqlContext.applySchema(rowRDD, schema); df.registerTempTable("people"); List actual = sqlContext.sql("SELECT * FROM people").toJavaRDD().map(new Function() { - + @Override public String call(Row row) { - return row.getString(0) + "_" + row.get(1).toString(); + return row.getString(0) + "_" + row.get(1); } }).collect(); - List expected = new ArrayList(2); + List expected = new ArrayList<>(2); expected.add("Michael_29"); expected.add("Yin_28"); @@ -165,7 +168,7 @@ public void applySchemaToJSON() { "{\"string\":\"this is another simple string.\", \"integer\":11, \"long\":21474836469, " + "\"bigInteger\":92233720368547758069, \"double\":1.7976931348623157E305, " + "\"boolean\":false, \"null\":null}")); - List fields = new ArrayList(7); + List fields = new ArrayList<>(7); fields.add(DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(20, 0), true)); fields.add(DataTypes.createStructField("boolean", DataTypes.BooleanType, true)); @@ -175,10 +178,10 @@ public void applySchemaToJSON() { fields.add(DataTypes.createStructField("null", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("string", DataTypes.StringType, true)); StructType expectedSchema = DataTypes.createStructType(fields); - List expectedResult = new ArrayList(2); + List expectedResult = new ArrayList<>(2); expectedResult.add( RowFactory.create( - new java.math.BigDecimal("92233720368547758070"), + new BigDecimal("92233720368547758070"), true, 1.7976931348623157E308, 10, @@ -187,7 +190,7 @@ public void applySchemaToJSON() { "this is a simple string.")); expectedResult.add( RowFactory.create( - new java.math.BigDecimal("92233720368547758069"), + new BigDecimal("92233720368547758069"), false, 1.7976931348623157E305, 11, diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 4867cebf5328c..a1a3fdbb486ea 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -37,6 +37,7 @@ import static org.apache.spark.sql.functions.*; import org.apache.spark.sql.test.TestSQLContext; import org.apache.spark.sql.types.*; +import static org.apache.spark.sql.types.DataTypes.*; public class JavaDataFrameSuite { private transient JavaSparkContext jsc; @@ -61,7 +62,7 @@ public void tearDown() { @Test public void testExecution() { DataFrame df = context.table("testData").filter("key = 1"); - 
Assert.assertEquals(df.select("key").collect()[0].get(0), 1); + Assert.assertEquals(1, df.select("key").collect()[0].get(0)); } /** @@ -90,6 +91,7 @@ public void testVarargMethods() { df.groupBy().mean("key"); df.groupBy().max("key"); df.groupBy().min("key"); + df.groupBy().stddev("key"); df.groupBy().sum("key"); // Varargs in column expressions @@ -119,7 +121,7 @@ public void testShow() { public static class Bean implements Serializable { private double a = 0.0; - private Integer[] b = new Integer[]{0, 1}; + private Integer[] b = { 0, 1 }; private Map c = ImmutableMap.of("hello", new int[] { 1, 2 }); private List d = Arrays.asList("floppy", "disk"); @@ -140,11 +142,7 @@ public List getD() { } } - @Test - public void testCreateDataFrameFromJavaBeans() { - Bean bean = new Bean(); - JavaRDD rdd = jsc.parallelize(Arrays.asList(bean)); - DataFrame df = context.createDataFrame(rdd, Bean.class); + void validateDataFrameWithBeans(Bean bean, DataFrame df) { StructType schema = df.schema(); Assert.assertEquals(new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a")); @@ -161,7 +159,7 @@ public void testCreateDataFrameFromJavaBeans() { schema.apply("d")); Row first = df.select("a", "b", "c", "d").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0); - // Now Java lists and maps are converetd to Scala Seq's and Map's. Once we get a Seq below, + // Now Java lists and maps are converted to Scala Seq's and Map's. Once we get a Seq below, // verify that it has the expected length, and contains expected elements. Seq result = first.getAs(1); Assert.assertEquals(bean.getB().length, result.length()); @@ -180,7 +178,33 @@ public void testCreateDataFrameFromJavaBeans() { } } - private static Comparator CrosstabRowComparator = new Comparator() { + @Test + public void testCreateDataFrameFromLocalJavaBeans() { + Bean bean = new Bean(); + List data = Arrays.asList(bean); + DataFrame df = context.createDataFrame(data, Bean.class); + validateDataFrameWithBeans(bean, df); + } + + @Test + public void testCreateDataFrameFromJavaBeans() { + Bean bean = new Bean(); + JavaRDD rdd = jsc.parallelize(Arrays.asList(bean)); + DataFrame df = context.createDataFrame(rdd, Bean.class); + validateDataFrameWithBeans(bean, df); + } + + @Test + public void testCreateDataFromFromList() { + StructType schema = createStructType(Arrays.asList(createStructField("i", IntegerType, true))); + List rows = Arrays.asList(RowFactory.create(0)); + DataFrame df = context.createDataFrame(rows, schema); + Row[] result = df.collect(); + Assert.assertEquals(1, result.length); + } + + private static final Comparator crosstabRowComparator = new Comparator() { + @Override public int compare(Row row1, Row row2) { String item1 = row1.getString(0); String item2 = row2.getString(0); @@ -193,16 +217,16 @@ public void testCrosstab() { DataFrame df = context.table("testData2"); DataFrame crosstab = df.stat().crosstab("a", "b"); String[] columnNames = crosstab.schema().fieldNames(); - Assert.assertEquals(columnNames[0], "a_b"); - Assert.assertEquals(columnNames[1], "1"); - Assert.assertEquals(columnNames[2], "2"); + Assert.assertEquals("a_b", columnNames[0]); + Assert.assertEquals("1", columnNames[1]); + Assert.assertEquals("2", columnNames[2]); Row[] rows = crosstab.collect(); - Arrays.sort(rows, CrosstabRowComparator); + Arrays.sort(rows, crosstabRowComparator); Integer count = 1; for (Row row : rows) { Assert.assertEquals(row.get(0).toString(), count.toString()); - Assert.assertEquals(row.getLong(1), 1L); - 
Assert.assertEquals(row.getLong(2), 1L); + Assert.assertEquals(1L, row.getLong(1)); + Assert.assertEquals(1L, row.getLong(2)); count++; } } @@ -210,7 +234,7 @@ public void testCrosstab() { @Test public void testFrequentItems() { DataFrame df = context.table("testData2"); - String[] cols = new String[]{"a"}; + String[] cols = {"a"}; DataFrame results = df.stat().freqItems(cols, 0.2); Assert.assertTrue(results.collect()[0].getSeq(0).contains(1)); } @@ -219,14 +243,14 @@ public void testFrequentItems() { public void testCorrelation() { DataFrame df = context.table("testData2"); Double pearsonCorr = df.stat().corr("a", "b", "pearson"); - Assert.assertTrue(Math.abs(pearsonCorr) < 1e-6); + Assert.assertTrue(Math.abs(pearsonCorr) < 1.0e-6); } @Test public void testCovariance() { DataFrame df = context.table("testData2"); Double result = df.stat().cov("a", "b"); - Assert.assertTrue(Math.abs(result) < 1e-6); + Assert.assertTrue(Math.abs(result) < 1.0e-6); } @Test @@ -234,7 +258,7 @@ public void testSampleBy() { DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key")); DataFrame sampled = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L); Row[] actual = sampled.groupBy("key").count().orderBy("key").collect(); - Row[] expected = new Row[] {RowFactory.create(0, 5), RowFactory.create(1, 8)}; + Row[] expected = {RowFactory.create(0, 5), RowFactory.create(1, 8)}; Assert.assertArrayEquals(expected, actual); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java index 4ce1d1dddb26a..3ab4db2a035d3 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java @@ -18,6 +18,7 @@ package test.org.apache.spark.sql; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.sql.Date; import java.sql.Timestamp; import java.util.Arrays; @@ -52,12 +53,12 @@ public void setUp() { shortValue = (short)32767; intValue = 2147483647; longValue = 9223372036854775807L; - floatValue = (float)3.4028235E38; + floatValue = 3.4028235E38f; doubleValue = 1.7976931348623157E308; decimalValue = new BigDecimal("1.7976931348623157E328"); booleanValue = true; stringValue = "this is a string"; - binaryValue = stringValue.getBytes(); + binaryValue = stringValue.getBytes(StandardCharsets.UTF_8); dateValue = Date.valueOf("2014-06-30"); timestampValue = Timestamp.valueOf("2014-06-30 09:20:00.0"); } @@ -123,8 +124,8 @@ public void constructSimpleRow() { Assert.assertEquals(binaryValue, simpleRow.get(16)); Assert.assertEquals(dateValue, simpleRow.get(17)); Assert.assertEquals(timestampValue, simpleRow.get(18)); - Assert.assertEquals(true, simpleRow.isNullAt(19)); - Assert.assertEquals(null, simpleRow.get(19)); + Assert.assertTrue(simpleRow.isNullAt(19)); + Assert.assertNull(simpleRow.get(19)); } @Test @@ -134,7 +135,7 @@ public void constructComplexRow() { stringValue + " (1)", stringValue + " (2)", stringValue + "(3)"); // Simple map - Map simpleMap = new HashMap(); + Map simpleMap = new HashMap<>(); simpleMap.put(stringValue + " (1)", longValue); simpleMap.put(stringValue + " (2)", longValue - 1); simpleMap.put(stringValue + " (3)", longValue - 2); @@ -149,7 +150,7 @@ public void constructComplexRow() { List arrayOfRows = Arrays.asList(simpleStruct); // Complex map - Map, Row> complexMap = new HashMap, Row>(); + Map, Row> complexMap = new HashMap<>(); complexMap.put(arrayOfRows, simpleStruct); // 
Complex struct @@ -167,7 +168,7 @@ public void constructComplexRow() { Assert.assertEquals(arrayOfMaps, complexStruct.get(3)); Assert.assertEquals(arrayOfRows, complexStruct.get(4)); Assert.assertEquals(complexMap, complexStruct.get(5)); - Assert.assertEquals(null, complexStruct.get(6)); + Assert.assertNull(complexStruct.get(6)); // A very complex row Row complexRow = RowFactory.create(arrayOfMaps, arrayOfRows, complexMap, complexStruct); diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java index bb02b58cca9be..4a78dca7fea66 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java @@ -20,6 +20,7 @@ import java.io.Serializable; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -61,13 +62,13 @@ public void udf1Test() { sqlContext.udf().register("stringLengthTest", new UDF1() { @Override - public Integer call(String str) throws Exception { + public Integer call(String str) { return str.length(); } }, DataTypes.IntegerType); Row result = sqlContext.sql("SELECT stringLengthTest('test')").head(); - assert(result.getInt(0) == 4); + Assert.assertEquals(4, result.getInt(0)); } @SuppressWarnings("unchecked") @@ -81,12 +82,12 @@ public void udf2Test() { sqlContext.udf().register("stringLengthTest", new UDF2() { @Override - public Integer call(String str1, String str2) throws Exception { + public Integer call(String str1, String str2) { return str1.length() + str2.length(); } }, DataTypes.IntegerType); Row result = sqlContext.sql("SELECT stringLengthTest('test', 'test2')").head(); - assert(result.getInt(0) == 9); + Assert.assertEquals(9, result.getInt(0)); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java index 6f9e7f68dc39c..9e241f20987c0 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java @@ -44,7 +44,7 @@ public class JavaSaveLoadSuite { File path; DataFrame df; - private void checkAnswer(DataFrame actual, List expected) { + private static void checkAnswer(DataFrame actual, List expected) { String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); if (errorMessage != null) { Assert.fail(errorMessage); @@ -64,7 +64,7 @@ public void setUp() throws IOException { path.delete(); } - List jsonObjects = new ArrayList(10); + List jsonObjects = new ArrayList<>(10); for (int i = 0; i < 10; i++) { jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}"); } @@ -82,7 +82,7 @@ public void tearDown() { @Test public void saveAndLoad() { - Map options = new HashMap(); + Map options = new HashMap<>(); options.put("path", path.toString()); df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save(); DataFrame loadedDF = sqlContext.read().format("json").options(options).load(); @@ -91,11 +91,11 @@ public void saveAndLoad() { @Test public void saveAndLoadWithSchema() { - Map options = new HashMap(); + Map options = new HashMap<>(); options.put("path", path.toString()); df.write().format("json").mode(SaveMode.ErrorIfExists).options(options).save(); - List fields = new ArrayList(); + List fields = new ArrayList<>(); fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); 
StructType schema = DataTypes.createStructType(fields); DataFrame loadedDF = sqlContext.read().format("json").schema(schema).options(options).load(); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index af7590c3d3c17..356d4ff3fa837 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import org.apache.spark.sql.execution.PhysicalRDD + import scala.concurrent.duration._ import scala.language.postfixOps @@ -34,7 +36,7 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { import testImplicits._ def rddIdOf(tableName: String): Int = { - val executedPlan = ctx.table(tableName).queryExecution.executedPlan + val executedPlan = sqlContext.table(tableName).queryExecution.executedPlan executedPlan.collect { case InMemoryColumnarTableScan(_, _, relation) => relation.cachedColumnBuffers.id @@ -44,7 +46,7 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { } def isMaterialized(rddId: Int): Boolean = { - ctx.sparkContext.env.blockManager.get(RDDBlockId(rddId, 0)).nonEmpty + sparkContext.env.blockManager.get(RDDBlockId(rddId, 0)).nonEmpty } test("withColumn doesn't invalidate cached dataframe") { @@ -69,41 +71,41 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { test("cache temp table") { testData.select('key).registerTempTable("tempTable") assertCached(sql("SELECT COUNT(*) FROM tempTable"), 0) - ctx.cacheTable("tempTable") + sqlContext.cacheTable("tempTable") assertCached(sql("SELECT COUNT(*) FROM tempTable")) - ctx.uncacheTable("tempTable") + sqlContext.uncacheTable("tempTable") } test("unpersist an uncached table will not raise exception") { - assert(None == ctx.cacheManager.lookupCachedData(testData)) + assert(None == sqlContext.cacheManager.lookupCachedData(testData)) testData.unpersist(blocking = true) - assert(None == ctx.cacheManager.lookupCachedData(testData)) + assert(None == sqlContext.cacheManager.lookupCachedData(testData)) testData.unpersist(blocking = false) - assert(None == ctx.cacheManager.lookupCachedData(testData)) + assert(None == sqlContext.cacheManager.lookupCachedData(testData)) testData.persist() - assert(None != ctx.cacheManager.lookupCachedData(testData)) + assert(None != sqlContext.cacheManager.lookupCachedData(testData)) testData.unpersist(blocking = true) - assert(None == ctx.cacheManager.lookupCachedData(testData)) + assert(None == sqlContext.cacheManager.lookupCachedData(testData)) testData.unpersist(blocking = false) - assert(None == ctx.cacheManager.lookupCachedData(testData)) + assert(None == sqlContext.cacheManager.lookupCachedData(testData)) } test("cache table as select") { sql("CACHE TABLE tempTable AS SELECT key FROM testData") assertCached(sql("SELECT COUNT(*) FROM tempTable")) - ctx.uncacheTable("tempTable") + sqlContext.uncacheTable("tempTable") } test("uncaching temp table") { testData.select('key).registerTempTable("tempTable1") testData.select('key).registerTempTable("tempTable2") - ctx.cacheTable("tempTable1") + sqlContext.cacheTable("tempTable1") assertCached(sql("SELECT COUNT(*) FROM tempTable1")) assertCached(sql("SELECT COUNT(*) FROM tempTable2")) // Is this valid? - ctx.uncacheTable("tempTable2") + sqlContext.uncacheTable("tempTable2") // Should this be cached? 
assertCached(sql("SELECT COUNT(*) FROM tempTable1"), 0) @@ -111,103 +113,103 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { test("too big for memory") { val data = "*" * 1000 - ctx.sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).toDF() + sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).toDF() .registerTempTable("bigData") - ctx.table("bigData").persist(StorageLevel.MEMORY_AND_DISK) - assert(ctx.table("bigData").count() === 200000L) - ctx.table("bigData").unpersist(blocking = true) + sqlContext.table("bigData").persist(StorageLevel.MEMORY_AND_DISK) + assert(sqlContext.table("bigData").count() === 200000L) + sqlContext.table("bigData").unpersist(blocking = true) } test("calling .cache() should use in-memory columnar caching") { - ctx.table("testData").cache() - assertCached(ctx.table("testData")) - ctx.table("testData").unpersist(blocking = true) + sqlContext.table("testData").cache() + assertCached(sqlContext.table("testData")) + sqlContext.table("testData").unpersist(blocking = true) } test("calling .unpersist() should drop in-memory columnar cache") { - ctx.table("testData").cache() - ctx.table("testData").count() - ctx.table("testData").unpersist(blocking = true) - assertCached(ctx.table("testData"), 0) + sqlContext.table("testData").cache() + sqlContext.table("testData").count() + sqlContext.table("testData").unpersist(blocking = true) + assertCached(sqlContext.table("testData"), 0) } test("isCached") { - ctx.cacheTable("testData") + sqlContext.cacheTable("testData") - assertCached(ctx.table("testData")) - assert(ctx.table("testData").queryExecution.withCachedData match { + assertCached(sqlContext.table("testData")) + assert(sqlContext.table("testData").queryExecution.withCachedData match { case _: InMemoryRelation => true case _ => false }) - ctx.uncacheTable("testData") - assert(!ctx.isCached("testData")) - assert(ctx.table("testData").queryExecution.withCachedData match { + sqlContext.uncacheTable("testData") + assert(!sqlContext.isCached("testData")) + assert(sqlContext.table("testData").queryExecution.withCachedData match { case _: InMemoryRelation => false case _ => true }) } test("SPARK-1669: cacheTable should be idempotent") { - assume(!ctx.table("testData").logicalPlan.isInstanceOf[InMemoryRelation]) + assume(!sqlContext.table("testData").logicalPlan.isInstanceOf[InMemoryRelation]) - ctx.cacheTable("testData") - assertCached(ctx.table("testData")) + sqlContext.cacheTable("testData") + assertCached(sqlContext.table("testData")) assertResult(1, "InMemoryRelation not found, testData should have been cached") { - ctx.table("testData").queryExecution.withCachedData.collect { + sqlContext.table("testData").queryExecution.withCachedData.collect { case r: InMemoryRelation => r }.size } - ctx.cacheTable("testData") + sqlContext.cacheTable("testData") assertResult(0, "Double InMemoryRelations found, cacheTable() is not idempotent") { - ctx.table("testData").queryExecution.withCachedData.collect { + sqlContext.table("testData").queryExecution.withCachedData.collect { case r @ InMemoryRelation(_, _, _, _, _: InMemoryColumnarTableScan, _) => r }.size } - ctx.uncacheTable("testData") + sqlContext.uncacheTable("testData") } test("read from cached table and uncache") { - ctx.cacheTable("testData") - checkAnswer(ctx.table("testData"), testData.collect().toSeq) - assertCached(ctx.table("testData")) + sqlContext.cacheTable("testData") + checkAnswer(sqlContext.table("testData"), testData.collect().toSeq) + 
assertCached(sqlContext.table("testData")) - ctx.uncacheTable("testData") - checkAnswer(ctx.table("testData"), testData.collect().toSeq) - assertCached(ctx.table("testData"), 0) + sqlContext.uncacheTable("testData") + checkAnswer(sqlContext.table("testData"), testData.collect().toSeq) + assertCached(sqlContext.table("testData"), 0) } test("correct error on uncache of non-cached table") { intercept[IllegalArgumentException] { - ctx.uncacheTable("testData") + sqlContext.uncacheTable("testData") } } test("SELECT star from cached table") { sql("SELECT * FROM testData").registerTempTable("selectStar") - ctx.cacheTable("selectStar") + sqlContext.cacheTable("selectStar") checkAnswer( sql("SELECT * FROM selectStar WHERE key = 1"), Seq(Row(1, "1"))) - ctx.uncacheTable("selectStar") + sqlContext.uncacheTable("selectStar") } test("Self-join cached") { val unCachedAnswer = sql("SELECT * FROM testData a JOIN testData b ON a.key = b.key").collect() - ctx.cacheTable("testData") + sqlContext.cacheTable("testData") checkAnswer( sql("SELECT * FROM testData a JOIN testData b ON a.key = b.key"), unCachedAnswer.toSeq) - ctx.uncacheTable("testData") + sqlContext.uncacheTable("testData") } test("'CACHE TABLE' and 'UNCACHE TABLE' SQL statement") { sql("CACHE TABLE testData") - assertCached(ctx.table("testData")) + assertCached(sqlContext.table("testData")) val rddId = rddIdOf("testData") assert( @@ -215,7 +217,7 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { "Eagerly cached in-memory table should have already been materialized") sql("UNCACHE TABLE testData") - assert(!ctx.isCached("testData"), "Table 'testData' should not be cached") + assert(!sqlContext.isCached("testData"), "Table 'testData' should not be cached") eventually(timeout(10 seconds)) { assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted") @@ -224,14 +226,14 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { test("CACHE TABLE tableName AS SELECT * FROM anotherTable") { sql("CACHE TABLE testCacheTable AS SELECT * FROM testData") - assertCached(ctx.table("testCacheTable")) + assertCached(sqlContext.table("testCacheTable")) val rddId = rddIdOf("testCacheTable") assert( isMaterialized(rddId), "Eagerly cached in-memory table should have already been materialized") - ctx.uncacheTable("testCacheTable") + sqlContext.uncacheTable("testCacheTable") eventually(timeout(10 seconds)) { assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted") } @@ -239,14 +241,14 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { test("CACHE TABLE tableName AS SELECT ...") { sql("CACHE TABLE testCacheTable AS SELECT key FROM testData LIMIT 10") - assertCached(ctx.table("testCacheTable")) + assertCached(sqlContext.table("testCacheTable")) val rddId = rddIdOf("testCacheTable") assert( isMaterialized(rddId), "Eagerly cached in-memory table should have already been materialized") - ctx.uncacheTable("testCacheTable") + sqlContext.uncacheTable("testCacheTable") eventually(timeout(10 seconds)) { assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted") } @@ -254,7 +256,7 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { test("CACHE LAZY TABLE tableName") { sql("CACHE LAZY TABLE testData") - assertCached(ctx.table("testData")) + assertCached(sqlContext.table("testData")) val rddId = rddIdOf("testData") assert( @@ -266,7 +268,7 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { isMaterialized(rddId), "Lazily 
cached in-memory table should have been materialized") - ctx.uncacheTable("testData") + sqlContext.uncacheTable("testData") eventually(timeout(10 seconds)) { assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted") } @@ -274,7 +276,7 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { test("InMemoryRelation statistics") { sql("CACHE TABLE testData") - ctx.table("testData").queryExecution.withCachedData.collect { + sqlContext.table("testData").queryExecution.withCachedData.collect { case cached: InMemoryRelation => val actualSizeInBytes = (1 to 100).map(i => INT.defaultSize + i.toString.length + 4).sum assert(cached.statistics.sizeInBytes === actualSizeInBytes) @@ -283,46 +285,48 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { test("Drops temporary table") { testData.select('key).registerTempTable("t1") - ctx.table("t1") - ctx.dropTempTable("t1") - assert(intercept[RuntimeException](ctx.table("t1")).getMessage.startsWith("Table Not Found")) + sqlContext.table("t1") + sqlContext.dropTempTable("t1") + assert( + intercept[RuntimeException](sqlContext.table("t1")).getMessage.startsWith("Table Not Found")) } test("Drops cached temporary table") { testData.select('key).registerTempTable("t1") testData.select('key).registerTempTable("t2") - ctx.cacheTable("t1") + sqlContext.cacheTable("t1") - assert(ctx.isCached("t1")) - assert(ctx.isCached("t2")) + assert(sqlContext.isCached("t1")) + assert(sqlContext.isCached("t2")) - ctx.dropTempTable("t1") - assert(intercept[RuntimeException](ctx.table("t1")).getMessage.startsWith("Table Not Found")) - assert(!ctx.isCached("t2")) + sqlContext.dropTempTable("t1") + assert( + intercept[RuntimeException](sqlContext.table("t1")).getMessage.startsWith("Table Not Found")) + assert(!sqlContext.isCached("t2")) } test("Clear all cache") { sql("SELECT key FROM testData LIMIT 10").registerTempTable("t1") sql("SELECT key FROM testData LIMIT 5").registerTempTable("t2") - ctx.cacheTable("t1") - ctx.cacheTable("t2") - ctx.clearCache() - assert(ctx.cacheManager.isEmpty) + sqlContext.cacheTable("t1") + sqlContext.cacheTable("t2") + sqlContext.clearCache() + assert(sqlContext.cacheManager.isEmpty) sql("SELECT key FROM testData LIMIT 10").registerTempTable("t1") sql("SELECT key FROM testData LIMIT 5").registerTempTable("t2") - ctx.cacheTable("t1") - ctx.cacheTable("t2") + sqlContext.cacheTable("t1") + sqlContext.cacheTable("t2") sql("Clear CACHE") - assert(ctx.cacheManager.isEmpty) + assert(sqlContext.cacheManager.isEmpty) } test("Clear accumulators when uncacheTable to prevent memory leaking") { sql("SELECT key FROM testData LIMIT 10").registerTempTable("t1") sql("SELECT key FROM testData LIMIT 5").registerTempTable("t2") - ctx.cacheTable("t1") - ctx.cacheTable("t2") + sqlContext.cacheTable("t1") + sqlContext.cacheTable("t2") sql("SELECT * FROM t1").count() sql("SELECT * FROM t2").count() @@ -331,9 +335,23 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { Accumulators.synchronized { val accsSize = Accumulators.originals.size - ctx.uncacheTable("t1") - ctx.uncacheTable("t2") + sqlContext.uncacheTable("t1") + sqlContext.uncacheTable("t2") assert((accsSize - 2) == Accumulators.originals.size) } } + + test("SPARK-10327 Cache Table is not working while subquery has alias in its project list") { + sparkContext.parallelize((1, 1) :: (2, 2) :: Nil) + .toDF("key", "value").selectExpr("key", "value", "key+1").registerTempTable("abc") + sqlContext.cacheTable("abc") + + val sparkPlan = sql( + """select 
a.key, b.key, c.key from + |abc a join abc b on a.key=b.key + |join abc c on a.key=c.key""".stripMargin).queryExecution.sparkPlan + + assert(sparkPlan.collect { case e: InMemoryColumnarTableScan => e }.size === 3) + assert(sparkPlan.collect { case e: PhysicalRDD => e }.size === 0) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 37738ec5b3c1d..4e988f074b113 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -29,7 +29,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { import testImplicits._ private lazy val booleanData = { - ctx.createDataFrame(ctx.sparkContext.parallelize( + sqlContext.createDataFrame(sparkContext.parallelize( Row(false, false) :: Row(false, true) :: Row(true, false) :: @@ -286,7 +286,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { } test("isNaN") { - val testData = ctx.createDataFrame(ctx.sparkContext.parallelize( + val testData = sqlContext.createDataFrame(sparkContext.parallelize( Row(Double.NaN, Float.NaN) :: Row(math.log(-1), math.log(-3).toFloat) :: Row(null, null) :: @@ -307,7 +307,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { } test("nanvl") { - val testData = ctx.createDataFrame(ctx.sparkContext.parallelize( + val testData = sqlContext.createDataFrame(sparkContext.parallelize( Row(null, 3.0, Double.NaN, Double.PositiveInfinity, 1.0f, 4) :: Nil), StructType(Seq(StructField("a", DoubleType), StructField("b", DoubleType), StructField("c", DoubleType), StructField("d", DoubleType), @@ -350,7 +350,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { } test("!==") { - val nullData = ctx.createDataFrame(ctx.sparkContext.parallelize( + val nullData = sqlContext.createDataFrame(sparkContext.parallelize( Row(1, 1) :: Row(1, 2) :: Row(1, null) :: @@ -411,7 +411,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { } test("between") { - val testData = ctx.sparkContext.parallelize( + val testData = sparkContext.parallelize( (0, 1, 2) :: (1, 2, 3) :: (2, 1, 0) :: @@ -556,7 +556,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { test("monotonicallyIncreasingId") { // Make sure we have 2 partitions, each with 2 records. - val df = ctx.sparkContext.parallelize(Seq[Int](), 2).mapPartitions { _ => + val df = sparkContext.parallelize(Seq[Int](), 2).mapPartitions { _ => Iterator(Tuple1(1), Tuple1(2)) }.toDF("a") checkAnswer( @@ -567,7 +567,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { test("sparkPartitionId") { // Make sure we have 2 partitions, each with 2 records. 
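// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] How the "2 partitions x 2 records"
// layout used by these two tests maps to the values they expect. Assumes the
// documented encoding of monotonicallyIncreasingId (partition index in the
// upper 31 bits, per-partition record offset in the lower 33 bits) and that
// sparkPartitionId simply returns the partition index for every row.
object PartitionIdSketch {
  // Combine a partition index and a per-partition offset the way the
  // monotonically increasing ID is described to do.
  def monotonicId(partition: Int, offset: Int): Long =
    (partition.toLong << 33) | offset.toLong

  def main(args: Array[String]): Unit = {
    // Two partitions, each with two records, as in the tests above.
    val layout = for (p <- 0 until 2; r <- 0 until 2) yield (p, r)
    println(layout.map { case (p, r) => monotonicId(p, r) }) // 0, 1, 8589934592, 8589934593
    println(layout.map(_._1))                                // 0, 0, 1, 1
  }
}
// ---------------------------------------------------------------------------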
- val df = ctx.sparkContext.parallelize(Seq[Int](), 2).mapPartitions { _ => + val df = sparkContext.parallelize(Seq[Int](), 2).mapPartitions { _ => Iterator(Tuple1(1), Tuple1(2)) }.toDF("a") checkAnswer( @@ -578,7 +578,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { test("InputFileName") { withTempPath { dir => - val data = sqlContext.sparkContext.parallelize(0 to 10).toDF("id") + val data = sparkContext.parallelize(0 to 10).toDF("id") data.write.parquet(dir.getCanonicalPath) val answer = sqlContext.read.parquet(dir.getCanonicalPath).select(inputFileName()) .head.getString(0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 72cf7aab0b977..f5ef9ffd7f4f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -66,12 +66,12 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { Seq(Row(1, 3), Row(2, 3), Row(3, 3)) ) - ctx.conf.setConf(SQLConf.DATAFRAME_RETAIN_GROUP_COLUMNS, false) + sqlContext.conf.setConf(SQLConf.DATAFRAME_RETAIN_GROUP_COLUMNS, false) checkAnswer( testData2.groupBy("a").agg(sum($"b")), Seq(Row(3), Row(3), Row(3)) ) - ctx.conf.setConf(SQLConf.DATAFRAME_RETAIN_GROUP_COLUMNS, true) + sqlContext.conf.setConf(SQLConf.DATAFRAME_RETAIN_GROUP_COLUMNS, true) } test("agg without groups") { @@ -175,6 +175,39 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { Row(0, null)) } + test("stddev") { + val testData2ADev = math.sqrt(4/5.0) + + checkAnswer( + testData2.agg(stddev('a)), + Row(testData2ADev)) + + checkAnswer( + testData2.agg(stddev_pop('a)), + Row(math.sqrt(4/6.0))) + + checkAnswer( + testData2.agg(stddev_samp('a)), + Row(testData2ADev)) + } + + test("zero stddev") { + val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") + assert(emptyTableData.count() == 0) + + checkAnswer( + emptyTableData.agg(stddev('a)), + Row(null)) + + checkAnswer( + emptyTableData.agg(stddev_pop('a)), + Row(null)) + + checkAnswer( + emptyTableData.agg(stddev_samp('a)), + Row(null)) + } + test("zero sum") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala index 3c359dd840ab7..09f7b507670c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala @@ -28,19 +28,19 @@ class DataFrameComplexTypeSuite extends QueryTest with SharedSQLContext { test("UDF on struct") { val f = udf((a: String) => a) - val df = sqlContext.sparkContext.parallelize(Seq((1, 1))).toDF("a", "b") + val df = sparkContext.parallelize(Seq((1, 1))).toDF("a", "b") df.select(struct($"a").as("s")).select(f($"s.a")).collect() } test("UDF on named_struct") { val f = udf((a: String) => a) - val df = sqlContext.sparkContext.parallelize(Seq((1, 1))).toDF("a", "b") + val df = sparkContext.parallelize(Seq((1, 1))).toDF("a", "b") df.selectExpr("named_struct('a', a) s").select(f($"s.a")).collect() } test("UDF on array") { val f = udf((a: String) => a) - val df = sqlContext.sparkContext.parallelize(Seq((1, 1))).toDF("a", "b") + val df = sparkContext.parallelize(Seq((1, 1))).toDF("a", "b") 
df.select(array($"a").as("s")).select(f(expr("s[0]"))).collect() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala index e5d7d63441a6b..094efbaeadcd5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala @@ -24,7 +24,7 @@ class DataFrameImplicitsSuite extends QueryTest with SharedSQLContext { test("RDD of tuples") { checkAnswer( - ctx.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("intCol", "strCol"), + sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("intCol", "strCol"), (1 to 10).map(i => Row(i, i.toString))) } @@ -36,19 +36,19 @@ class DataFrameImplicitsSuite extends QueryTest with SharedSQLContext { test("RDD[Int]") { checkAnswer( - ctx.sparkContext.parallelize(1 to 10).toDF("intCol"), + sparkContext.parallelize(1 to 10).toDF("intCol"), (1 to 10).map(i => Row(i))) } test("RDD[Long]") { checkAnswer( - ctx.sparkContext.parallelize(1L to 10L).toDF("longCol"), + sparkContext.parallelize(1L to 10L).toDF("longCol"), (1L to 10L).map(i => Row(i))) } test("RDD[String]") { checkAnswer( - ctx.sparkContext.parallelize(1 to 10).map(_.toString).toDF("stringCol"), + sparkContext.parallelize(1 to 10).map(_.toString).toDF("stringCol"), (1 to 10).map(i => Row(i.toString))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index e2716d7841d85..56ad71ea4f487 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -42,6 +42,19 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext { Row(1, 2, "1", "2") :: Row(2, 3, "2", "3") :: Row(3, 4, "3", "4") :: Nil) } + test("join - join using multiple columns and specifying join type") { + val df = Seq(1, 2, 3).map(i => (i, i + 1, i.toString)).toDF("int", "int2", "str") + val df2 = Seq(1, 2, 3).map(i => (i, i + 1, (i + 1).toString)).toDF("int", "int2", "str") + + checkAnswer( + df.join(df2, Seq("int", "str"), "left"), + Row(1, 2, "1", null) :: Row(2, 3, "2", null) :: Row(3, 4, "3", null) :: Nil) + + checkAnswer( + df.join(df2, Seq("int", "str"), "right"), + Row(null, null, null, 2) :: Row(null, null, null, 3) :: Row(null, null, null, 4) :: Nil) + } + test("join - join using self join") { val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 28bdd6f83b687..6524abcf5e97f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -29,7 +29,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { test("sample with replacement") { val n = 100 - val data = ctx.sparkContext.parallelize(1 to n, 2).toDF("id") + val data = sparkContext.parallelize(1 to n, 2).toDF("id") checkAnswer( data.sample(withReplacement = true, 0.05, seed = 13), Seq(5, 10, 52, 73).map(Row(_)) @@ -38,7 +38,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { test("sample without replacement") { val n = 100 - val data = ctx.sparkContext.parallelize(1 to n, 2).toDF("id") + val data = sparkContext.parallelize(1 to 
n, 2).toDF("id") checkAnswer( data.sample(withReplacement = false, 0.05, seed = 13), Seq(16, 23, 88, 100).map(Row(_)) @@ -47,7 +47,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { test("randomSplit") { val n = 600 - val data = ctx.sparkContext.parallelize(1 to n, 2).toDF("id") + val data = sparkContext.parallelize(1 to n, 2).toDF("id") for (seed <- 1 to 5) { val splits = data.randomSplit(Array[Double](1, 2, 3), seed) assert(splits.length == 3, "wrong number of splits") @@ -164,7 +164,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { } test("Frequent Items 2") { - val rows = ctx.sparkContext.parallelize(Seq.empty[Int], 4) + val rows = sparkContext.parallelize(Seq.empty[Int], 4) // this is a regression test, where when merging partitions, we omitted values with higher // counts than those that existed in the map when the map was full. This test should also fail // if anything like SPARK-9614 is observed once again @@ -182,7 +182,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { } test("sampleBy") { - val df = ctx.range(0, 100).select((col("id") % 3).as("key")) + val df = sqlContext.range(0, 100).select((col("id") % 3).as("key")) val sampled = df.stat.sampleBy("key", Map(0 -> 0.1, 1 -> 0.2), 0L) checkAnswer( sampled.groupBy("key").count().orderBy("key"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 284fff184085a..d919877746c72 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -22,6 +22,8 @@ import java.io.File import scala.language.postfixOps import scala.util.Random +import org.scalatest.Matchers._ + import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @@ -345,7 +347,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { } test("replace column using withColumn") { - val df2 = sqlContext.sparkContext.parallelize(Array(1, 2, 3)).toDF("x") + val df2 = sparkContext.parallelize(Array(1, 2, 3)).toDF("x") val df3 = df2.withColumn("x", df2("x") + 1) checkAnswer( df3.select("x"), @@ -434,7 +436,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { val describeResult = Seq( Row("count", "4", "4"), Row("mean", "33.0", "178.0"), - Row("stddev", "16.583123951777", "10.0"), + Row("stddev", "19.148542155126762", "11.547005383792516"), Row("min", "16", "164"), Row("max", "60", "192")) @@ -506,7 +508,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { test("showString: truncate = [true, false]") { val longString = Array.fill(21)("1").mkString - val df = sqlContext.sparkContext.parallelize(Seq("1", longString)).toDF() + val df = sparkContext.parallelize(Seq("1", longString)).toDF() val expectedAnswerForFalse = """+---------------------+ ||_1 | |+---------------------+ @@ -596,7 +598,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { } test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") { - val rowRDD = sqlContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0)))) + val rowRDD = sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0)))) val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false))) val df = sqlContext.createDataFrame(rowRDD, schema) df.rdd.collect() @@ -619,14 +621,14 @@ class DataFrameSuite extends QueryTest 
with SharedSQLContext { } test("SPARK-7551: support backticks for DataFrame attribute resolution") { - val df = sqlContext.read.json(sqlContext.sparkContext.makeRDD( + val df = sqlContext.read.json(sparkContext.makeRDD( """{"a.b": {"c": {"d..e": {"f": 1}}}}""" :: Nil)) checkAnswer( df.select(df("`a.b`.c.`d..e`.`f`")), Row(1) ) - val df2 = sqlContext.read.json(sqlContext.sparkContext.makeRDD( + val df2 = sqlContext.read.json(sparkContext.makeRDD( """{"a b": {"c": {"d e": {"f": 1}}}}""" :: Nil)) checkAnswer( df2.select(df2("`a b`.c.d e.f")), @@ -646,7 +648,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { } test("SPARK-7324 dropDuplicates") { - val testData = sqlContext.sparkContext.parallelize( + val testData = sparkContext.parallelize( (2, 1, 2) :: (1, 1, 1) :: (1, 2, 1) :: (2, 1, 2) :: (2, 2, 2) :: (2, 2, 1) :: @@ -869,7 +871,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { } test("SPARK-9323: DataFrame.orderBy should support nested column name") { - val df = sqlContext.read.json(sqlContext.sparkContext.makeRDD( + val df = sqlContext.read.json(sparkContext.makeRDD( """{"a": {"b": 1}}""" :: Nil)) checkAnswer(df.orderBy("a.b"), Row(Row(1))) } @@ -887,4 +889,72 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { .select(struct($"b")) .collect() } + + test("SPARK-10034: Sort on Aggregate with aggregation expression named 'aggOrdering'") { + val df = Seq(1 -> 2).toDF("i", "j") + val query = df.groupBy('i) + .agg(max('j).as("aggOrdering")) + .orderBy(sum('j)) + checkAnswer(query, Row(1, 2)) + } + + test("SPARK-10316: respect non-deterministic expressions in PhysicalOperation") { + val input = sqlContext.read.json(sqlContext.sparkContext.makeRDD( + (1 to 10).map(i => s"""{"id": $i}"""))) + + val df = input.select($"id", rand(0).as('r)) + df.as("a").join(df.filter($"r" < 0.5).as("b"), $"a.id" === $"b.id").collect().foreach { row => + assert(row.getDouble(1) - row.getDouble(3) === 0.0 +- 0.001) + } + } + + test("SPARK-10539: Project should not be pushed down through Intersect or Except") { + val df1 = (1 to 100).map(Tuple1.apply).toDF("i") + val df2 = (1 to 30).map(Tuple1.apply).toDF("i") + val intersect = df1.intersect(df2) + val except = df1.except(df2) + assert(intersect.count() === 30) + assert(except.count() === 70) + } + + test("SPARK-10740: handle nondeterministic expressions correctly for set operations") { + val df1 = (1 to 20).map(Tuple1.apply).toDF("i") + val df2 = (1 to 10).map(Tuple1.apply).toDF("i") + + // When generating expected results at here, we need to follow the implementation of + // Rand expression. 
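  // [Editor's note, not part of the patch] rand(seed) re-seeds its generator per
  // partition (an XORShiftRandom seeded with seed + partitionIndex), so reproducible
  // expected values have to be generated partition by partition with the same
  // seeding; that is exactly what the helper below does via collectPartitions().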
+ def expected(df: DataFrame): Seq[Row] = { + df.rdd.collectPartitions().zipWithIndex.flatMap { + case (data, index) => + val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index) + data.filter(_.getInt(0) < rng.nextDouble() * 10) + } + } + + val union = df1.unionAll(df2) + checkAnswer( + union.filter('i < rand(7) * 10), + expected(union) + ) + checkAnswer( + union.select(rand(7)), + union.rdd.collectPartitions().zipWithIndex.flatMap { + case (data, index) => + val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index) + data.map(_ => rng.nextDouble()).map(i => Row(i)) + } + ) + + val intersect = df1.intersect(df2) + checkAnswer( + intersect.filter('i < rand(7) * 10), + expected(intersect) + ) + + val except = df1.except(df2) + checkAnswer( + except.filter('i < rand(7) * 10), + expected(except) + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala index 77907e91363ec..7ae12a7895f7e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala @@ -32,7 +32,7 @@ class DataFrameTungstenSuite extends QueryTest with SharedSQLContext { test("test simple types") { withSQLConf(SQLConf.UNSAFE_ENABLED.key -> "true") { - val df = sqlContext.sparkContext.parallelize(Seq((1, 2))).toDF("a", "b") + val df = sparkContext.parallelize(Seq((1, 2))).toDF("a", "b") assert(df.select(struct("a", "b")).first().getStruct(0) === Row(1, 2)) } } @@ -40,7 +40,7 @@ class DataFrameTungstenSuite extends QueryTest with SharedSQLContext { test("test struct type") { withSQLConf(SQLConf.UNSAFE_ENABLED.key -> "true") { val struct = Row(1, 2L, 3.0F, 3.0) - val data = sqlContext.sparkContext.parallelize(Seq(Row(1, struct))) + val data = sparkContext.parallelize(Seq(Row(1, struct))) val schema = new StructType() .add("a", IntegerType) @@ -60,7 +60,7 @@ class DataFrameTungstenSuite extends QueryTest with SharedSQLContext { withSQLConf(SQLConf.UNSAFE_ENABLED.key -> "true") { val innerStruct = Row(1, "abcd") val outerStruct = Row(1, 2L, 3.0F, 3.0, innerStruct, "efg") - val data = sqlContext.sparkContext.parallelize(Seq(Row(1, outerStruct))) + val data = sparkContext.parallelize(Seq(Row(1, outerStruct))) val schema = new StructType() .add("a", IntegerType) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala new file mode 100644 index 0000000000000..78a98798eff64 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExtraStrategiesSuite.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute} +import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.{Row, Strategy, QueryTest} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.unsafe.types.UTF8String + +case class FastOperator(output: Seq[Attribute]) extends SparkPlan { + + override protected def doExecute(): RDD[InternalRow] = { + val str = Literal("so fast").value + val row = new GenericInternalRow(Array[Any](str)) + sparkContext.parallelize(Seq(row)) + } + + override def children: Seq[SparkPlan] = Nil +} + +object TestStrategy extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case Project(Seq(attr), _) if attr.name == "a" => + FastOperator(attr.toAttribute :: Nil) :: Nil + case _ => Nil + } +} + +class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + test("insert an extraStrategy") { + try { + sqlContext.experimental.extraStrategies = TestStrategy :: Nil + + val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") + checkAnswer( + df.select("a"), + Row("so fast")) + + checkAnswer( + df.select("a", "b"), + Row("so slow", 1)) + } finally { + sqlContext.experimental.extraStrategies = Nil + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index f5c5046a8ed88..7a027e13089e3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -31,7 +31,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { val x = testData2.as("x") val y = testData2.as("y") val join = x.join(y, $"x.a" === $"y.a", "inner").queryExecution.optimizedPlan - val planned = ctx.planner.EquiJoinSelection(join) + val planned = sqlContext.planner.EquiJoinSelection(join) assert(planned.size === 1) } @@ -59,7 +59,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { } test("join operator selection") { - ctx.cacheManager.clearCache() + sqlContext.cacheManager.clearCache() Seq( ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash]), @@ -83,7 +83,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { ("SELECT * FROM testData right join testData2 ON key = a and key = 2", classOf[SortMergeOuterJoin]), ("SELECT * FROM testData full outer join testData2 ON key = a", - classOf[ShuffledHashOuterJoin]), + classOf[SortMergeOuterJoin]), ("SELECT * FROM testData left JOIN testData2 ON (key * a != key + a)", classOf[BroadcastNestedLoopJoin]), ("SELECT * FROM testData right JOIN testData2 ON (key * a != key + a)", @@ -118,7 +118,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { } test("broadcasted hash join operator selection") { - ctx.cacheManager.clearCache() + sqlContext.cacheManager.clearCache() sql("CACHE TABLE testData") for (sortMergeJoinEnabled <- Seq(true, false)) { withClue(s"sortMergeJoinEnabled=$sortMergeJoinEnabled") { @@ -138,7 +138,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { } test("broadcasted hash outer join operator selection") { - ctx.cacheManager.clearCache() + sqlContext.cacheManager.clearCache() sql("CACHE TABLE testData") withSQLConf(SQLConf.SORTMERGE_JOIN.key -> "true") { 
Seq( @@ -167,7 +167,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { val x = testData2.as("x") val y = testData2.as("y") val join = x.join(y, ($"x.a" === $"y.a") && ($"x.b" === $"y.b")).queryExecution.optimizedPlan - val planned = ctx.planner.EquiJoinSelection(join) + val planned = sqlContext.planner.EquiJoinSelection(join) assert(planned.size === 1) } @@ -442,7 +442,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { } test("broadcasted left semi join operator selection") { - ctx.cacheManager.clearCache() + sqlContext.cacheManager.clearCache() sql("CACHE TABLE testData") withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "1000000000") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 045fea82e4c89..e3531d0d6d799 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -29,4 +29,42 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext { Row("alice", "5")) } + + val tuples: Seq[(String, String)] = + ("1", """{"f1": "value1", "f2": "value2", "f3": 3, "f5": 5.23}""") :: + ("2", """{"f1": "value12", "f3": "value3", "f2": 2, "f4": 4.01}""") :: + ("3", """{"f1": "value13", "f4": "value44", "f3": "value33", "f2": 2, "f5": 5.01}""") :: + ("4", null) :: + ("5", """{"f1": "", "f5": null}""") :: + ("6", "[invalid JSON string]") :: + Nil + + test("json_tuple select") { + val df: DataFrame = tuples.toDF("key", "jstring") + val expected = Row("1", Row("value1", "value2", "3", null, "5.23")) :: + Row("2", Row("value12", "2", "value3", "4.01", null)) :: + Row("3", Row("value13", "2", "value33", "value44", "5.01")) :: + Row("4", Row(null, null, null, null, null)) :: + Row("5", Row("", null, null, null, null)) :: + Row("6", Row(null, null, null, null, null)) :: + Nil + + checkAnswer(df.selectExpr("key", "json_tuple(jstring, 'f1', 'f2', 'f3', 'f4', 'f5')"), expected) + } + + test("json_tuple filter and group") { + val df: DataFrame = tuples.toDF("key", "jstring") + val expr = df + .selectExpr("json_tuple(jstring, 'f1', 'f2') as jt") + .where($"jt.c0".isNotNull) + .groupBy($"jt.c1") + .count() + + val expected = Row(null, 1) :: + Row("2", 2) :: + Row("value2", 1) :: + Nil + + checkAnswer(expr, expected) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala index babf8835d2545..eab0fbb196eb6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala @@ -32,33 +32,33 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContex } after { - ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable")) + sqlContext.catalog.unregisterTable(Seq("ListTablesSuiteTable")) } test("get all tables") { checkAnswer( - ctx.tables().filter("tableName = 'ListTablesSuiteTable'"), + sqlContext.tables().filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) checkAnswer( sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) - ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable")) - assert(ctx.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) + sqlContext.catalog.unregisterTable(Seq("ListTablesSuiteTable")) + assert(sqlContext.tables().filter("tableName = 
'ListTablesSuiteTable'").count() === 0) } test("getting all Tables with a database name has no impact on returned table names") { checkAnswer( - ctx.tables("DB").filter("tableName = 'ListTablesSuiteTable'"), + sqlContext.tables("DB").filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) checkAnswer( sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) - ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable")) - assert(ctx.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) + sqlContext.catalog.unregisterTable(Seq("ListTablesSuiteTable")) + assert(sqlContext.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) } test("query the returned DataFrame of tables") { @@ -66,7 +66,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContex StructField("tableName", StringType, false) :: StructField("isTemporary", BooleanType, false) :: Nil) - Seq(ctx.tables(), sql("SHOW TABLes")).foreach { + Seq(sqlContext.tables(), sql("SHOW TABLes")).foreach { case tableDF => assert(expectedSchema === tableDF.schema) @@ -77,9 +77,9 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContex Row(true, "ListTablesSuiteTable") ) checkAnswer( - ctx.tables().filter("tableName = 'tables'").select("tableName", "isTemporary"), + sqlContext.tables().filter("tableName = 'tables'").select("tableName", "isTemporary"), Row("tables", true)) - ctx.dropTempTable("tables") + sqlContext.dropTempTable("tables") } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala index 30289c3c1d097..58f982c2bc932 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala @@ -37,9 +37,11 @@ class MathExpressionsSuite extends QueryTest with SharedSQLContext { private lazy val nullDoubles = Seq(NullDoubles(1.0), NullDoubles(2.0), NullDoubles(3.0), NullDoubles(null)).toDF() - private def testOneToOneMathFunction[@specialized(Int, Long, Float, Double) T]( + private def testOneToOneMathFunction[ + @specialized(Int, Long, Float, Double) T, + @specialized(Int, Long, Float, Double) U]( c: Column => Column, - f: T => T): Unit = { + f: T => U): Unit = { checkAnswer( doubleData.select(c('a)), (1 to 10).map(n => Row(f((n * 0.2 - 1).asInstanceOf[T]))) @@ -165,10 +167,10 @@ class MathExpressionsSuite extends QueryTest with SharedSQLContext { } test("ceil and ceiling") { - testOneToOneMathFunction(ceil, math.ceil) + testOneToOneMathFunction(ceil, (d: Double) => math.ceil(d).toLong) checkAnswer( sql("SELECT ceiling(0), ceiling(1), ceiling(1.5)"), - Row(0.0, 1.0, 2.0)) + Row(0L, 1L, 2L)) } test("conv") { @@ -184,7 +186,7 @@ class MathExpressionsSuite extends QueryTest with SharedSQLContext { } test("floor") { - testOneToOneMathFunction(floor, math.floor) + testOneToOneMathFunction(floor, (d: Double) => math.floor(d).toLong) } test("factorial") { @@ -228,7 +230,7 @@ class MathExpressionsSuite extends QueryTest with SharedSQLContext { } test("signum / sign") { - testOneToOneMathFunction[Double](signum, math.signum) + testOneToOneMathFunction[Double, Double](signum, math.signum) checkAnswer( sql("SELECT sign(10), signum(-11)"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 3649c2a97b5ef..e3c5a426671d0 
100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -25,7 +25,9 @@ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.columnar.InMemoryRelation -class QueryTest extends PlanTest { +abstract class QueryTest extends PlanTest { + + protected def sqlContext: SQLContext // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) @@ -56,18 +58,33 @@ class QueryTest extends PlanTest { * @param df the [[DataFrame]] to be executed * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. */ - protected def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Unit = { - QueryTest.checkAnswer(df, expectedAnswer) match { + protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = { + val analyzedDF = try df catch { + case ae: AnalysisException => + val currentValue = sqlContext.conf.dataFrameEagerAnalysis + sqlContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, false) + val partiallyAnalzyedPlan = df.queryExecution.analyzed + sqlContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, currentValue) + fail( + s""" + |Failed to analyze query: $ae + |$partiallyAnalzyedPlan + | + |${stackTraceToString(ae)} + |""".stripMargin) + } + + QueryTest.checkAnswer(analyzedDF, expectedAnswer) match { case Some(errorMessage) => fail(errorMessage) case None => } } - protected def checkAnswer(df: DataFrame, expectedAnswer: Row): Unit = { + protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { checkAnswer(df, Seq(expectedAnswer)) } - protected def checkAnswer(df: DataFrame, expectedAnswer: DataFrame): Unit = { + protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { checkAnswer(df, expectedAnswer.collect()) } @@ -98,19 +115,26 @@ object QueryTest { */ def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Option[String] = { val isSorted = df.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty + + // We need to call prepareRow recursively to handle schemas with struct types. + def prepareRow(row: Row): Row = { + Row.fromSeq(row.toSeq.map { + case null => null + case d: java.math.BigDecimal => BigDecimal(d) + // Convert array to Seq for easy equality check. + case b: Array[_] => b.toSeq + case r: Row => prepareRow(r) + case o => o + }) + } + def prepareAnswer(answer: Seq[Row]): Seq[Row] = { // Converts data to types that we can do equality comparison using Scala collections. // For BigDecimal type, the Scala type has a better definition of equality test (similar to // Java's java.math.BigDecimal.compareTo). // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for // equality test. 
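// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The equality pitfalls prepareRow
// normalizes away, shown in plain Scala: arrays compare by reference, and
// java.math.BigDecimal.equals is scale-sensitive, so both are converted before
// rows are compared structurally.
object RowEqualitySketch {
  def main(args: Array[String]): Unit = {
    val left = Array[Byte](1, 2, 3)
    val right = Array[Byte](1, 2, 3)
    println(left == right)                    // false: arrays use reference equality
    println(left.toSeq == right.toSeq)        // true: Seq compares element by element

    val j1 = new java.math.BigDecimal("1.0")
    val j2 = new java.math.BigDecimal("1.00")
    println(j1 == j2)                         // false: Java equals() also compares scale
    println(BigDecimal(j1) == BigDecimal(j2)) // true: scala.BigDecimal compares numerically
  }
}
// ---------------------------------------------------------------------------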
- val converted: Seq[Row] = answer.map { s => - Row.fromSeq(s.toSeq.map { - case d: java.math.BigDecimal => BigDecimal(d) - case b: Array[Byte] => b.toSeq - case o => o - }) - } + val converted: Seq[Row] = answer.map(prepareRow) if (!isSorted) converted.sortBy(_.toString()) else converted } val sparkAnswer = try df.collect().toSeq catch { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala index 77ccd6f775e50..3ba14d7602a62 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala @@ -57,7 +57,7 @@ class RowSuite extends SparkFunSuite with SharedSQLContext { test("serialize w/ kryo") { val row = Seq((1, Seq(1), Map(1 -> 1), BigDecimal(1))).toDF().first() - val serializer = new SparkSqlSerializer(ctx.sparkContext.getConf) + val serializer = new SparkSqlSerializer(sparkContext.getConf) val instance = serializer.newInstance() val ser = instance.serialize(row) val de = instance.deserialize(ser).asInstanceOf[Row] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala index 7699adadd9cc8..3d2bd236ceead 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLConfSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.test.{TestSQLContext, SharedSQLContext} class SQLConfSuite extends QueryTest with SharedSQLContext { @@ -27,58 +27,67 @@ class SQLConfSuite extends QueryTest with SharedSQLContext { test("propagate from spark conf") { // We create a new context here to avoid order dependence with other tests that might call // clear(). - val newContext = new SQLContext(ctx.sparkContext) + val newContext = new SQLContext(sparkContext) assert(newContext.getConf("spark.sql.testkey", "false") === "true") } test("programmatic ways of basic setting and getting") { - ctx.conf.clear() - assert(ctx.getAllConfs.size === 0) - - ctx.setConf(testKey, testVal) - assert(ctx.getConf(testKey) === testVal) - assert(ctx.getConf(testKey, testVal + "_") === testVal) - assert(ctx.getAllConfs.contains(testKey)) + // Set a conf first. + sqlContext.setConf(testKey, testVal) + // Clear the conf. + sqlContext.conf.clear() + // After clear, only overrideConfs used by unit test should be in the SQLConf. + assert(sqlContext.getAllConfs === TestSQLContext.overrideConfs) + + sqlContext.setConf(testKey, testVal) + assert(sqlContext.getConf(testKey) === testVal) + assert(sqlContext.getConf(testKey, testVal + "_") === testVal) + assert(sqlContext.getAllConfs.contains(testKey)) // Tests SQLConf as accessed from a SQLContext is mutable after // the latter is initialized, unlike SparkConf inside a SparkContext. 
- assert(ctx.getConf(testKey) == testVal) - assert(ctx.getConf(testKey, testVal + "_") === testVal) - assert(ctx.getAllConfs.contains(testKey)) + assert(sqlContext.getConf(testKey) === testVal) + assert(sqlContext.getConf(testKey, testVal + "_") === testVal) + assert(sqlContext.getAllConfs.contains(testKey)) - ctx.conf.clear() + sqlContext.conf.clear() } test("parse SQL set commands") { - ctx.conf.clear() + sqlContext.conf.clear() sql(s"set $testKey=$testVal") - assert(ctx.getConf(testKey, testVal + "_") === testVal) - assert(ctx.getConf(testKey, testVal + "_") === testVal) + assert(sqlContext.getConf(testKey, testVal + "_") === testVal) + assert(sqlContext.getConf(testKey, testVal + "_") === testVal) sql("set some.property=20") - assert(ctx.getConf("some.property", "0") === "20") + assert(sqlContext.getConf("some.property", "0") === "20") sql("set some.property = 40") - assert(ctx.getConf("some.property", "0") === "40") + assert(sqlContext.getConf("some.property", "0") === "40") val key = "spark.sql.key" val vs = "val0,val_1,val2.3,my_table" sql(s"set $key=$vs") - assert(ctx.getConf(key, "0") === vs) + assert(sqlContext.getConf(key, "0") === vs) sql(s"set $key=") - assert(ctx.getConf(key, "0") === "") + assert(sqlContext.getConf(key, "0") === "") - ctx.conf.clear() + sqlContext.conf.clear() } test("deprecated property") { - ctx.conf.clear() - sql(s"set ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS}=10") - assert(ctx.conf.numShufflePartitions === 10) + sqlContext.conf.clear() + val original = sqlContext.conf.numShufflePartitions + try{ + sql(s"set ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS}=10") + assert(sqlContext.conf.numShufflePartitions === 10) + } finally { + sql(s"set ${SQLConf.SHUFFLE_PARTITIONS}=$original") + } } test("invalid conf value") { - ctx.conf.clear() + sqlContext.conf.clear() val e = intercept[IllegalArgumentException] { sql(s"set ${SQLConf.CASE_SENSITIVE.key}=10") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala index 007be12950774..dd88ae3700ab9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala @@ -24,7 +24,7 @@ class SQLContextSuite extends SparkFunSuite with SharedSQLContext { override def afterAll(): Unit = { try { - SQLContext.setLastInstantiatedContext(ctx) + SQLContext.setLastInstantiatedContext(sqlContext) } finally { super.afterAll() } @@ -32,18 +32,18 @@ class SQLContextSuite extends SparkFunSuite with SharedSQLContext { test("getOrCreate instantiates SQLContext") { SQLContext.clearLastInstantiatedContext() - val sqlContext = SQLContext.getOrCreate(ctx.sparkContext) + val sqlContext = SQLContext.getOrCreate(sparkContext) assert(sqlContext != null, "SQLContext.getOrCreate returned null") - assert(SQLContext.getOrCreate(ctx.sparkContext).eq(sqlContext), + assert(SQLContext.getOrCreate(sparkContext).eq(sqlContext), "SQLContext created by SQLContext.getOrCreate not returned by SQLContext.getOrCreate") } test("getOrCreate gets last explicitly instantiated SQLContext") { SQLContext.clearLastInstantiatedContext() - val sqlContext = new SQLContext(ctx.sparkContext) - assert(SQLContext.getOrCreate(ctx.sparkContext) != null, + val sqlContext = new SQLContext(sparkContext) + assert(SQLContext.getOrCreate(sparkContext) != null, "SQLContext.getOrCreate after explicitly created SQLContext returned null") - assert(SQLContext.getOrCreate(ctx.sparkContext).eq(sqlContext), + 
assert(SQLContext.getOrCreate(sparkContext).eq(sqlContext), "SQLContext.getOrCreate after explicitly created SQLContext did not return the context") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 9e172b2c264cb..eca6f1073889a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.errors.DialectException import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SQLTestData._ -import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.test.{SharedSQLContext, TestSQLContext} import org.apache.spark.sql.types._ /** A SQL Dialect for testing purpose, and it can not be nested type */ @@ -147,14 +147,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SQL Dialect Switching to a new SQL parser") { - val newContext = new SQLContext(sqlContext.sparkContext) + val newContext = new SQLContext(sparkContext) newContext.setConf("spark.sql.dialect", classOf[MyDialect].getCanonicalName()) assert(newContext.getSQLDialect().getClass === classOf[MyDialect]) assert(newContext.sql("SELECT 1").collect() === Array(Row(1))) } test("SQL Dialect Switch to an invalid parser with alias") { - val newContext = new SQLContext(sqlContext.sparkContext) + val newContext = new SQLContext(sparkContext) newContext.sql("SET spark.sql.dialect=MyTestClass") intercept[DialectException] { newContext.sql("SELECT 1") @@ -196,7 +196,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("grouping on nested fields") { - sqlContext.read.json(sqlContext.sparkContext.parallelize( + sqlContext.read.json(sparkContext.parallelize( """{"nested": {"attribute": 1}, "value": 2}""" :: Nil)) .registerTempTable("rows") @@ -215,7 +215,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-6201 IN type conversion") { sqlContext.read.json( - sqlContext.sparkContext.parallelize( + sparkContext.parallelize( Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}"))) .registerTempTable("d") @@ -328,6 +328,13 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { testCodeGen( "SELECT min(key) FROM testData3x", Row(1) :: Nil) + // STDDEV + testCodeGen( + "SELECT a, stddev(b), stddev_pop(b) FROM testData2 GROUP BY a", + (1 to 3).map(i => Row(i, math.sqrt(0.5), math.sqrt(0.25)))) + testCodeGen( + "SELECT stddev(b), stddev_pop(b), stddev_samp(b) FROM testData2", + Row(math.sqrt(1.5 / 5), math.sqrt(1.5 / 6), math.sqrt(1.5 / 5)) :: Nil) // Some combinations. 
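// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Where the stddev constants in these
// suites come from. Assumes testData2 holds the (a, b) pairs
// (1,1), (1,2), (2,1), (2,2), (3,1), (3,2), which is consistent with every
// expected value above and below.
object StddevSketch {
  def popVariance(xs: Seq[Double]): Double = {
    val mean = xs.sum / xs.size
    xs.map(x => (x - mean) * (x - mean)).sum / xs.size
  }
  def sampleVariance(xs: Seq[Double]): Double = {
    val mean = xs.sum / xs.size
    xs.map(x => (x - mean) * (x - mean)).sum / (xs.size - 1)
  }

  def main(args: Array[String]): Unit = {
    val a = Seq(1.0, 1.0, 2.0, 2.0, 3.0, 3.0)
    val b = Seq(1.0, 2.0, 1.0, 2.0, 1.0, 2.0)
    println(math.sqrt(sampleVariance(Seq(1.0, 2.0)))) // per-group stddev(b)     = sqrt(0.5)
    println(math.sqrt(popVariance(Seq(1.0, 2.0))))    // per-group stddev_pop(b) = sqrt(0.25)
    println(math.sqrt(sampleVariance(b)))             // stddev(b) over all rows = sqrt(1.5/5)
    println(math.sqrt(popVariance(b)))                // stddev_pop(b)           = sqrt(1.5/6)
    println(math.sqrt(sampleVariance(a)))             // stddev(a)               = sqrt(4/5)
    // The DataFrameSuite.describe expectation changes for the same reason:
    // sample stddev = population stddev * sqrt(n / (n - 1)); with n = 4,
    // 10.0 * math.sqrt(4.0 / 3.0) is approximately 11.547005383792516.
  }
}
// ---------------------------------------------------------------------------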
testCodeGen( """ @@ -348,8 +355,8 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(100, 1, 50.5, 300, 100) :: Nil) // Aggregate with Code generation handling all null values testCodeGen( - "SELECT sum('a'), avg('a'), count(null) FROM testData", - Row(null, null, 0) :: Nil) + "SELECT sum('a'), avg('a'), stddev('a'), count(null) FROM testData", + Row(null, null, null, 0) :: Nil) } finally { sqlContext.dropTempTable("testData3x") sqlContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue) @@ -515,8 +522,8 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("aggregates with nulls") { checkAnswer( - sql("SELECT MIN(a), MAX(a), AVG(a), SUM(a), COUNT(a) FROM nullInts"), - Row(1, 3, 2, 6, 3) + sql("SELECT MIN(a), MAX(a), AVG(a), STDDEV(a), SUM(a), COUNT(a) FROM nullInts"), + Row(1, 3, 2, 1, 6, 3) ) } @@ -574,28 +581,12 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { mapData.collect().sortBy(_.data(1)).reverse.map(Row.fromTuple).toSeq) } - test("sorting") { - withSQLConf(SQLConf.EXTERNAL_SORT.key -> "false") { - sortTest() - } - } - test("external sorting") { - withSQLConf(SQLConf.EXTERNAL_SORT.key -> "true") { - sortTest() - } - } - - test("SPARK-6927 sorting with codegen on") { - withSQLConf(SQLConf.EXTERNAL_SORT.key -> "false", - SQLConf.CODEGEN_ENABLED.key -> "true") { - sortTest() - } + sortTest() } test("SPARK-6927 external sorting with codegen on") { - withSQLConf(SQLConf.EXTERNAL_SORT.key -> "true", - SQLConf.CODEGEN_ENABLED.key -> "true") { + withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "true") { sortTest() } } @@ -722,6 +713,33 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("stddev") { + checkAnswer( + sql("SELECT STDDEV(a) FROM testData2"), + Row(math.sqrt(4/5.0)) + ) + } + + test("stddev_pop") { + checkAnswer( + sql("SELECT STDDEV_POP(a) FROM testData2"), + Row(math.sqrt(4/6.0)) + ) + } + + test("stddev_samp") { + checkAnswer( + sql("SELECT STDDEV_SAMP(a) FROM testData2"), + Row(math.sqrt(4/5.0)) + ) + } + + test("stddev agg") { + checkAnswer( + sql("SELECT a, stddev(b), stddev_pop(b), stddev_samp(b) FROM testData2 GROUP BY a"), + (1 to 3).map(i => Row(i, math.sqrt(1/2.0), math.sqrt(1/4.0), math.sqrt(1/2.0)))) + } + test("inner join where, one match per row") { checkAnswer( sql("SELECT * FROM upperCaseData JOIN lowerCaseData WHERE n = N"), @@ -991,21 +1009,30 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { val nonexistentKey = "nonexistent" // "set" itself returns all config variables currently specified in SQLConf. 
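  // [Editor's note, not part of the patch] The expected size changes from 0 to
  // TestSQLContext.overrideConfs.size because the shared test context now
  // pre-populates a handful of test-only configuration overrides, and SET reports
  // everything currently held in SQLConf, so it is no longer empty by default.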
- assert(sql("SET").collect().size == 0) + assert(sql("SET").collect().size === TestSQLContext.overrideConfs.size) + sql("SET").collect().foreach { row => + val key = row.getString(0) + val value = row.getString(1) + assert( + TestSQLContext.overrideConfs.contains(key), + s"$key should exist in SQLConf.") + assert( + TestSQLContext.overrideConfs(key) === value, + s"The value of $key should be ${TestSQLContext.overrideConfs(key)} instead of $value.") + } + val overrideConfs = sql("SET").collect() // "set key=val" sql(s"SET $testKey=$testVal") checkAnswer( sql("SET"), - Row(testKey, testVal) + overrideConfs ++ Seq(Row(testKey, testVal)) ) sql(s"SET ${testKey + testKey}=${testVal + testVal}") checkAnswer( sql("set"), - Seq( - Row(testKey, testVal), - Row(testKey + testKey, testVal + testVal)) + overrideConfs ++ Seq(Row(testKey, testVal), Row(testKey + testKey, testVal + testVal)) ) // "set key" @@ -1342,7 +1369,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-3483 Special chars in column names") { - val data = sqlContext.sparkContext.parallelize( + val data = sparkContext.parallelize( Seq("""{"key?number1": "value1", "key.number2": "value2"}""")) sqlContext.read.json(data).registerTempTable("records") sql("SELECT `key?number1`, `key.number2` FROM records") @@ -1385,13 +1412,13 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-4322 Grouping field with struct field as sub expression") { - sqlContext.read.json(sqlContext.sparkContext.makeRDD("""{"a": {"b": [{"c": 1}]}}""" :: Nil)) + sqlContext.read.json(sparkContext.makeRDD("""{"a": {"b": [{"c": 1}]}}""" :: Nil)) .registerTempTable("data") checkAnswer(sql("SELECT a.b[0].c FROM data GROUP BY a.b[0].c"), Row(1)) sqlContext.dropTempTable("data") sqlContext.read.json( - sqlContext.sparkContext.makeRDD("""{"a": {"b": 1}}""" :: Nil)).registerTempTable("data") + sparkContext.makeRDD("""{"a": {"b": 1}}""" :: Nil)).registerTempTable("data") checkAnswer(sql("SELECT a.b + 1 FROM data GROUP BY a.b + 1"), Row(2)) sqlContext.dropTempTable("data") } @@ -1412,10 +1439,10 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("Supporting relational operator '<=>' in Spark SQL") { val nullCheckData1 = TestData(1, "1") :: TestData(2, null) :: Nil - val rdd1 = sqlContext.sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i))) + val rdd1 = sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i))) rdd1.toDF().registerTempTable("nulldata1") val nullCheckData2 = TestData(1, "1") :: TestData(2, null) :: Nil - val rdd2 = sqlContext.sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i))) + val rdd2 = sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i))) rdd2.toDF().registerTempTable("nulldata2") checkAnswer(sql("SELECT nulldata1.key FROM nulldata1 join " + "nulldata2 on nulldata1.value <=> nulldata2.value"), @@ -1424,7 +1451,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("Multi-column COUNT(DISTINCT ...)") { val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil - val rdd = sqlContext.sparkContext.parallelize((0 to 1).map(i => data(i))) + val rdd = sparkContext.parallelize((0 to 1).map(i => data(i))) rdd.toDF().registerTempTable("distinctData") checkAnswer(sql("SELECT COUNT(DISTINCT key,value) FROM distinctData"), Row(2)) } @@ -1432,14 +1459,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-4699 case sensitivity SQL query") { sqlContext.setConf(SQLConf.CASE_SENSITIVE, false) val data = TestData(1, "val_1") 
:: TestData(2, "val_2") :: Nil - val rdd = sqlContext.sparkContext.parallelize((0 to 1).map(i => data(i))) + val rdd = sparkContext.parallelize((0 to 1).map(i => data(i))) rdd.toDF().registerTempTable("testTable1") checkAnswer(sql("SELECT VALUE FROM TESTTABLE1 where KEY = 1"), Row("val_1")) sqlContext.setConf(SQLConf.CASE_SENSITIVE, true) } test("SPARK-6145: ORDER BY test for nested fields") { - sqlContext.read.json(sqlContext.sparkContext.makeRDD( + sqlContext.read.json(sparkContext.makeRDD( """{"a": {"b": 1, "a": {"a": 1}}, "c": [{"d": 1}]}""" :: Nil)) .registerTempTable("nestedOrder") @@ -1452,14 +1479,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-6145: special cases") { - sqlContext.read.json(sqlContext.sparkContext.makeRDD( + sqlContext.read.json(sparkContext.makeRDD( """{"a": {"b": [1]}, "b": [{"a": 1}], "_c0": {"a": 1}}""" :: Nil)).registerTempTable("t") checkAnswer(sql("SELECT a.b[0] FROM t ORDER BY _c0.a"), Row(1)) checkAnswer(sql("SELECT b[0].a FROM t ORDER BY _c0.a"), Row(1)) } test("SPARK-6898: complete support for special chars in column names") { - sqlContext.read.json(sqlContext.sparkContext.makeRDD( + sqlContext.read.json(sparkContext.makeRDD( """{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""" :: Nil)) .registerTempTable("t") @@ -1490,6 +1517,16 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { """.stripMargin), Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil) + checkAnswer( + sql( + """ + |SELECT sum(b) + |FROM orderByData + |GROUP BY a + |ORDER BY sum(b), max(b) + """.stripMargin), + Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil) + checkAnswer( sql( """ @@ -1509,6 +1546,26 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { |ORDER BY sum(b) + 1 """.stripMargin), Row("4", 3) :: Row("1", 7) :: Row("3", 11) :: Row("2", 15) :: Nil) + + checkAnswer( + sql( + """ + |SELECT count(*) + |FROM orderByData + |GROUP BY a + |ORDER BY count(*) + """.stripMargin), + Row(2) :: Row(2) :: Row(2) :: Row(2) :: Nil) + + checkAnswer( + sql( + """ + |SELECT a + |FROM orderByData + |GROUP BY a + |ORDER BY a, count(*), sum(b) + """.stripMargin), + Row("1") :: Row("2") :: Row("3") :: Row("4") :: Nil) } test("SPARK-7952: fix the equality check between boolean and numeric types") { @@ -1533,7 +1590,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-7067: order by queries for complex ExtractValue chain") { withTempTable("t") { - sqlContext.read.json(sqlContext.sparkContext.makeRDD( + sqlContext.read.json(sparkContext.makeRDD( """{"a": {"b": [{"c": 1}]}, "b": [{"d": 1}]}""" :: Nil)).registerTempTable("t") checkAnswer(sql("SELECT a.b FROM t ORDER BY b[0].d"), Row(Seq(Row(1)))) } @@ -1600,8 +1657,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { test("aggregation with codegen updates peak execution memory") { withSQLConf((SQLConf.CODEGEN_ENABLED.key, "true")) { - val sc = sqlContext.sparkContext - AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "aggregation with codegen") { + AccumulatorSuite.verifyPeakExecutionMemorySet(sparkContext, "aggregation with codegen") { testCodeGen( "SELECT key, count(value) FROM testData GROUP BY key", (1 to 100).map(i => Row(i, 1))) @@ -1659,17 +1715,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("external sorting updates peak execution memory") { - withSQLConf((SQLConf.EXTERNAL_SORT.key, "true")) { - val sc = sqlContext.sparkContext - AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external sort") { - sortTest() - } 
+ AccumulatorSuite.verifyPeakExecutionMemorySet(sparkContext, "external sort") { + sortTest() } } test("SPARK-9511: error with table starting with number") { withTempTable("1one") { - sqlContext.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)) + sparkContext.parallelize(1 to 10).map(i => (i, i.toString)) .toDF("num", "str") .registerTempTable("1one") checkAnswer(sql("select count(num) from 1one"), Row(10)) @@ -1680,7 +1733,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { withTempPath { dir => val path = dir.getCanonicalPath val df = - sqlContext.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") + sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") df .write .format("parquet") @@ -1712,9 +1765,48 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-10130 type coercion for IF should have children resolved first") { - val df = Seq((1, 1), (-1, 1)).toDF("key", "value") - df.registerTempTable("src") - checkAnswer( - sql("SELECT IF(a > 0, a, 0) FROM (SELECT key a FROM src) temp"), Seq(Row(1), Row(0))) + withTempTable("src") { + Seq((1, 1), (-1, 1)).toDF("key", "value").registerTempTable("src") + checkAnswer( + sql("SELECT IF(a > 0, a, 0) FROM (SELECT key a FROM src) temp"), Seq(Row(1), Row(0))) + } + } + + test("SPARK-10389: order by non-attribute grouping expression on Aggregate") { + withTempTable("src") { + Seq((1, 1), (-1, 1)).toDF("key", "value").registerTempTable("src") + checkAnswer(sql("SELECT MAX(value) FROM src GROUP BY key + 1 ORDER BY key + 1"), + Seq(Row(1), Row(1))) + checkAnswer(sql("SELECT MAX(value) FROM src GROUP BY key + 1 ORDER BY (key + 1) * 2"), + Seq(Row(1), Row(1))) + } + } + + test("SortMergeJoin returns wrong results when using UnsafeRows") { + // This test is for the fix of https://issues.apache.org/jira/browse/SPARK-10737. + // This bug will be triggered when Tungsten is enabled and there are multiple + // SortMergeJoin operators executed in the same task. 
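  // [Editor's note, not part of the patch] The settings below pin down that code
  // path: SORTMERGE_JOIN forces sort-merge joins, an AUTO_BROADCASTJOIN_THRESHOLD
  // of 1 byte effectively rules out broadcast joins, and TUNGSTEN_ENABLED makes the
  // joins run over UnsafeRows, the combination the regression described above needs.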
+ val confs = + SQLConf.SORTMERGE_JOIN.key -> "true" :: + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "1" :: + SQLConf.TUNGSTEN_ENABLED.key -> "true" :: Nil + withSQLConf(confs: _*) { + val df1 = (1 to 50).map(i => (s"str_$i", i)).toDF("i", "j") + val df2 = + df1 + .join(df1.select(df1("i")), "i") + .select(df1("i"), df1("j")) + + val df3 = df2.withColumnRenamed("i", "i1").withColumnRenamed("j", "j1") + val df4 = + df2 + .join(df3, df2("i") === df3("i1")) + .withColumn("diff", $"j" - $"j1") + .select(df2("i"), df2("j"), $"diff") + + checkAnswer( + df4, + df1.withColumn("diff", lit(0))) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala index 45d0ee4a8e749..ddab918629645 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.test.SharedSQLContext class SerializationSuite extends SparkFunSuite with SharedSQLContext { test("[SPARK-5235] SQLContext should be serializable") { - val _sqlContext = new SQLContext(sqlContext.sparkContext) + val _sqlContext = new SQLContext(sparkContext) new JavaSerializer(new SparkConf()).newInstance().serialize(_sqlContext) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index b91438baea06f..e12e6bea30260 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -268,9 +268,7 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext { Row(3, 4)) intercept[AnalysisException] { - checkAnswer( - df.selectExpr("length(c)"), // int type of the argument is unacceptable - Row("5.0000")) + df.selectExpr("length(c)") // int type of the argument is unacceptable } } @@ -284,63 +282,46 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext { } test("number format function") { - val tuple = - ("aa", 1.asInstanceOf[Byte], 2.asInstanceOf[Short], - 3.13223f, 4, 5L, 6.48173d, Decimal(7.128381)) - val df = - Seq(tuple) - .toDF( - "a", // string "aa" - "b", // byte 1 - "c", // short 2 - "d", // float 3.13223f - "e", // integer 4 - "f", // long 5L - "g", // double 6.48173d - "h") // decimal 7.128381 - - checkAnswer( - df.select(format_number($"f", 4)), + val df = sqlContext.range(1) + + checkAnswer( + df.select(format_number(lit(5L), 4)), Row("5.0000")) checkAnswer( - df.selectExpr("format_number(b, e)"), // convert the 1st argument to integer + df.select(format_number(lit(1.toByte), 4)), // convert the 1st argument to integer Row("1.0000")) checkAnswer( - df.selectExpr("format_number(c, e)"), // convert the 1st argument to integer + df.select(format_number(lit(2.toShort), 4)), // convert the 1st argument to integer Row("2.0000")) checkAnswer( - df.selectExpr("format_number(d, e)"), // convert the 1st argument to double + df.select(format_number(lit(3.1322.toFloat), 4)), // convert the 1st argument to double Row("3.1322")) checkAnswer( - df.selectExpr("format_number(e, e)"), // not convert anything + df.select(format_number(lit(4), 4)), // not convert anything Row("4.0000")) checkAnswer( - df.selectExpr("format_number(f, e)"), // not convert anything + df.select(format_number(lit(5L), 4)), // not convert anything Row("5.0000")) checkAnswer( - df.selectExpr("format_number(g, e)"), 
// not convert anything + df.select(format_number(lit(6.48173), 4)), // not convert anything Row("6.4817")) checkAnswer( - df.selectExpr("format_number(h, e)"), // not convert anything + df.select(format_number(lit(BigDecimal(7.128381)), 4)), // not convert anything Row("7.1284")) intercept[AnalysisException] { - checkAnswer( - df.selectExpr("format_number(a, e)"), // string type of the 1st argument is unacceptable - Row("5.0000")) + df.select(format_number(lit("aa"), 4)) // string type of the 1st argument is unacceptable } intercept[AnalysisException] { - checkAnswer( - df.selectExpr("format_number(e, g)"), // decimal type of the 2nd argument is unacceptable - Row("5.0000")) + df.selectExpr("format_number(4, 6.48173)") // non-integral type 2nd argument is unacceptable } // for testing the mutable state of the expression in code gen. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index eb275af101e2f..e0435a0dba6ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -26,7 +26,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("built-in fixed arity expressions") { - val df = ctx.emptyDataFrame + val df = sqlContext.emptyDataFrame df.selectExpr("rand()", "randn()", "rand(5)", "randn(50)") } @@ -55,23 +55,23 @@ class UDFSuite extends QueryTest with SharedSQLContext { val df = Seq((1, "Tearing down the walls that divide us")).toDF("id", "saying") df.registerTempTable("tmp_table") checkAnswer(sql("select spark_partition_id() from tmp_table").toDF(), Row(0)) - ctx.dropTempTable("tmp_table") + sqlContext.dropTempTable("tmp_table") } test("SPARK-8005 input_file_name") { withTempPath { dir => - val data = ctx.sparkContext.parallelize(0 to 10, 2).toDF("id") + val data = sparkContext.parallelize(0 to 10, 2).toDF("id") data.write.parquet(dir.getCanonicalPath) - ctx.read.parquet(dir.getCanonicalPath).registerTempTable("test_table") + sqlContext.read.parquet(dir.getCanonicalPath).registerTempTable("test_table") val answer = sql("select input_file_name() from test_table").head().getString(0) assert(answer.contains(dir.getCanonicalPath)) assert(sql("select input_file_name() from test_table").distinct().collect().length >= 2) - ctx.dropTempTable("test_table") + sqlContext.dropTempTable("test_table") } } test("error reporting for incorrect number of arguments") { - val df = ctx.emptyDataFrame + val df = sqlContext.emptyDataFrame val e = intercept[AnalysisException] { df.selectExpr("substr('abcd', 2, 3, 4)") } @@ -79,7 +79,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { } test("error reporting for undefined functions") { - val df = ctx.emptyDataFrame + val df = sqlContext.emptyDataFrame val e = intercept[AnalysisException] { df.selectExpr("a_function_that_does_not_exist()") } @@ -87,24 +87,24 @@ class UDFSuite extends QueryTest with SharedSQLContext { } test("Simple UDF") { - ctx.udf.register("strLenScala", (_: String).length) + sqlContext.udf.register("strLenScala", (_: String).length) assert(sql("SELECT strLenScala('test')").head().getInt(0) === 4) } test("ZeroArgument UDF") { - ctx.udf.register("random0", () => { Math.random()}) + sqlContext.udf.register("random0", () => { Math.random()}) assert(sql("SELECT random0()").head().getDouble(0) >= 0.0) } test("TwoArgument UDF") { - ctx.udf.register("strLenScala", (_: String).length + (_: Int)) + sqlContext.udf.register("strLenScala", 
(_: String).length + (_: Int)) assert(sql("SELECT strLenScala('test', 1)").head().getInt(0) === 5) } test("UDF in a WHERE") { - ctx.udf.register("oneArgFilter", (n: Int) => { n > 80 }) + sqlContext.udf.register("oneArgFilter", (n: Int) => { n > 80 }) - val df = ctx.sparkContext.parallelize( + val df = sparkContext.parallelize( (1 to 100).map(i => TestData(i, i.toString))).toDF() df.registerTempTable("integerData") @@ -114,7 +114,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { } test("UDF in a HAVING") { - ctx.udf.register("havingFilter", (n: Long) => { n > 5 }) + sqlContext.udf.register("havingFilter", (n: Long) => { n > 5 }) val df = Seq(("red", 1), ("red", 2), ("blue", 10), ("green", 100), ("green", 200)).toDF("g", "v") @@ -133,7 +133,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { } test("UDF in a GROUP BY") { - ctx.udf.register("groupFunction", (n: Int) => { n > 10 }) + sqlContext.udf.register("groupFunction", (n: Int) => { n > 10 }) val df = Seq(("red", 1), ("red", 2), ("blue", 10), ("green", 100), ("green", 200)).toDF("g", "v") @@ -150,10 +150,10 @@ class UDFSuite extends QueryTest with SharedSQLContext { } test("UDFs everywhere") { - ctx.udf.register("groupFunction", (n: Int) => { n > 10 }) - ctx.udf.register("havingFilter", (n: Long) => { n > 2000 }) - ctx.udf.register("whereFilter", (n: Int) => { n < 150 }) - ctx.udf.register("timesHundred", (n: Long) => { n * 100 }) + sqlContext.udf.register("groupFunction", (n: Int) => { n > 10 }) + sqlContext.udf.register("havingFilter", (n: Long) => { n > 2000 }) + sqlContext.udf.register("whereFilter", (n: Int) => { n < 150 }) + sqlContext.udf.register("timesHundred", (n: Long) => { n * 100 }) val df = Seq(("red", 1), ("red", 2), ("blue", 10), ("green", 100), ("green", 200)).toDF("g", "v") @@ -172,7 +172,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { } test("struct UDF") { - ctx.udf.register("returnStruct", (f1: String, f2: String) => FunctionResult(f1, f2)) + sqlContext.udf.register("returnStruct", (f1: String, f2: String) => FunctionResult(f1, f2)) val result = sql("SELECT returnStruct('test', 'test2') as ret") @@ -181,13 +181,13 @@ class UDFSuite extends QueryTest with SharedSQLContext { } test("udf that is transformed") { - ctx.udf.register("makeStruct", (x: Int, y: Int) => (x, y)) + sqlContext.udf.register("makeStruct", (x: Int, y: Int) => (x, y)) // 1 + 1 is constant folded causing a transformation. assert(sql("SELECT makeStruct(1 + 1, 2)").first().getAs[Row](0) === Row(2, 2)) } test("type coercion for udf inputs") { - ctx.udf.register("intExpected", (x: Int) => x) + sqlContext.udf.register("intExpected", (x: Int) => x) // pass a decimal to intExpected. 
assert(sql("SELECT intExpected(1.0)").head().getInt(0) === 1) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index 219435dff5bc8..2476b10e3cf9e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -43,12 +43,12 @@ class UnsafeRowSuite extends SparkFunSuite { val arrayBackedUnsafeRow: UnsafeRow = UnsafeProjection.create(Array[DataType](StringType, StringType, IntegerType)).apply(row) assert(arrayBackedUnsafeRow.getBaseObject.isInstanceOf[Array[Byte]]) - val bytesFromArrayBackedRow: Array[Byte] = { + val (bytesFromArrayBackedRow, field0StringFromArrayBackedRow): (Array[Byte], String) = { val baos = new ByteArrayOutputStream() arrayBackedUnsafeRow.writeToStream(baos, null) - baos.toByteArray + (baos.toByteArray, arrayBackedUnsafeRow.getString(0)) } - val bytesFromOffheapRow: Array[Byte] = { + val (bytesFromOffheapRow, field0StringFromOffheapRow): (Array[Byte], String) = { val offheapRowPage = MemoryAllocator.UNSAFE.allocate(arrayBackedUnsafeRow.getSizeInBytes) try { Platform.copyMemory( @@ -69,13 +69,14 @@ class UnsafeRowSuite extends SparkFunSuite { val baos = new ByteArrayOutputStream() val writeBuffer = new Array[Byte](1024) offheapUnsafeRow.writeToStream(baos, writeBuffer) - baos.toByteArray + (baos.toByteArray, offheapUnsafeRow.getString(0)) } finally { MemoryAllocator.UNSAFE.free(offheapRowPage) } } assert(bytesFromArrayBackedRow === bytesFromOffheapRow) + assert(field0StringFromArrayBackedRow === field0StringFromOffheapRow) } test("calling getDouble() and getFloat() on null columns") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index b6d279ae47268..7992fd59ff4ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -22,6 +22,7 @@ import scala.beans.{BeanInfo, BeanProperty} import com.clearspring.analytics.stream.cardinality.HyperLogLog import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{OpenHashSetUDT, HyperLogLogUDT} import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext @@ -90,7 +91,7 @@ class UserDefinedTypeSuite extends QueryTest with SharedSQLContext { } test("UDTs and UDFs") { - ctx.udf.register("testType", (d: MyDenseVector) => d.isInstanceOf[MyDenseVector]) + sqlContext.udf.register("testType", (d: MyDenseVector) => d.isInstanceOf[MyDenseVector]) pointsRDD.registerTempTable("points") checkAnswer( sql("SELECT testType(features) from points"), @@ -148,8 +149,8 @@ class UserDefinedTypeSuite extends QueryTest with SharedSQLContext { StructField("vec", new MyDenseVectorUDT, false) )) - val stringRDD = ctx.sparkContext.parallelize(data) - val jsonRDD = ctx.read.schema(schema).json(stringRDD) + val stringRDD = sparkContext.parallelize(data) + val jsonRDD = sqlContext.read.schema(schema).json(stringRDD) checkAnswer( jsonRDD, Row(1, new MyDenseVector(Array(1.1, 2.2, 3.3, 4.4))) :: @@ -157,4 +158,20 @@ class UserDefinedTypeSuite extends QueryTest with SharedSQLContext { Nil ) } + + test("SPARK-10472 UserDefinedType.typeName") { + assert(IntegerType.typeName === "integer") + assert(new MyDenseVectorUDT().typeName === "mydensevector") + 
assert(new OpenHashSetUDT(IntegerType).typeName === "openhashset") + } + + test("Catalyst type converter null handling for UDTs") { + val udt = new MyDenseVectorUDT() + val toScalaConverter = CatalystTypeConverters.createToScalaConverter(udt) + assert(toScalaConverter(null) === null) + + val toCatalystConverter = CatalystTypeConverters.createToCatalystConverter(udt) + assert(toCatalystConverter(null) === null) + + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala index 952637c5f9cb8..ea5dd2be33d63 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala @@ -31,7 +31,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { setupTestData() test("simple columnar query") { - val plan = ctx.executePlan(testData.logicalPlan).executedPlan + val plan = sqlContext.executePlan(testData.logicalPlan).executedPlan val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None) checkAnswer(scan, testData.collect().toSeq) @@ -39,16 +39,16 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("default size avoids broadcast") { // TODO: Improve this test when we have better statistics - ctx.sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)) + sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)) .toDF().registerTempTable("sizeTst") - ctx.cacheTable("sizeTst") + sqlContext.cacheTable("sizeTst") assert( - ctx.table("sizeTst").queryExecution.analyzed.statistics.sizeInBytes > - ctx.conf.autoBroadcastJoinThreshold) + sqlContext.table("sizeTst").queryExecution.analyzed.statistics.sizeInBytes > + sqlContext.conf.autoBroadcastJoinThreshold) } test("projection") { - val plan = ctx.executePlan(testData.select('value, 'key).logicalPlan).executedPlan + val plan = sqlContext.executePlan(testData.select('value, 'key).logicalPlan).executedPlan val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None) checkAnswer(scan, testData.collect().map { @@ -57,7 +57,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-1436 regression: in-memory columns must be able to be accessed multiple times") { - val plan = ctx.executePlan(testData.logicalPlan).executedPlan + val plan = sqlContext.executePlan(testData.logicalPlan).executedPlan val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None) checkAnswer(scan, testData.collect().toSeq) @@ -69,7 +69,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { sql("SELECT * FROM repeatedData"), repeatedData.collect().toSeq.map(Row.fromTuple)) - ctx.cacheTable("repeatedData") + sqlContext.cacheTable("repeatedData") checkAnswer( sql("SELECT * FROM repeatedData"), @@ -81,7 +81,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { sql("SELECT * FROM nullableRepeatedData"), nullableRepeatedData.collect().toSeq.map(Row.fromTuple)) - ctx.cacheTable("nullableRepeatedData") + sqlContext.cacheTable("nullableRepeatedData") checkAnswer( sql("SELECT * FROM nullableRepeatedData"), @@ -96,7 +96,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { sql("SELECT time FROM timestamps"), timestamps.collect().toSeq) - ctx.cacheTable("timestamps") + sqlContext.cacheTable("timestamps") 
checkAnswer( sql("SELECT time FROM timestamps"), @@ -108,7 +108,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { sql("SELECT * FROM withEmptyParts"), withEmptyParts.collect().toSeq.map(Row.fromTuple)) - ctx.cacheTable("withEmptyParts") + sqlContext.cacheTable("withEmptyParts") checkAnswer( sql("SELECT * FROM withEmptyParts"), @@ -157,7 +157,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { // Create a RDD for the schema val rdd = - ctx.sparkContext.parallelize((1 to 100), 10).map { i => + sparkContext.parallelize((1 to 100), 10).map { i => Row( s"str${i}: test cache.", s"binary${i}: test cache.".getBytes("UTF-8"), @@ -177,18 +177,46 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { (0 to i).map(j => s"map_key_$j" -> (Long.MaxValue - j)).toMap, Row((i - 0.25).toFloat, Seq(true, false, null))) } - ctx.createDataFrame(rdd, schema).registerTempTable("InMemoryCache_different_data_types") + sqlContext.createDataFrame(rdd, schema).registerTempTable("InMemoryCache_different_data_types") // Cache the table. sql("cache table InMemoryCache_different_data_types") // Make sure the table is indeed cached. - val tableScan = ctx.table("InMemoryCache_different_data_types").queryExecution.executedPlan + sqlContext.table("InMemoryCache_different_data_types").queryExecution.executedPlan assert( - ctx.isCached("InMemoryCache_different_data_types"), + sqlContext.isCached("InMemoryCache_different_data_types"), "InMemoryCache_different_data_types should be cached.") // Issue a query and check the results. checkAnswer( sql(s"SELECT DISTINCT ${allColumns} FROM InMemoryCache_different_data_types"), - ctx.table("InMemoryCache_different_data_types").collect()) - ctx.dropTempTable("InMemoryCache_different_data_types") + sqlContext.table("InMemoryCache_different_data_types").collect()) + sqlContext.dropTempTable("InMemoryCache_different_data_types") + } + + test("SPARK-10422: String column in InMemoryColumnarCache needs to override clone method") { + val df = sqlContext.range(1, 100).selectExpr("id % 10 as id") + .rdd.map(id => Tuple1(s"str_$id")).toDF("i") + val cached = df.cache() + // count triggers the caching action. It should not throw. + cached.count() + + // Make sure, the DataFrame is indeed cached. + assert(sqlContext.cacheManager.lookupCachedData(cached).nonEmpty) + + // Check result. + checkAnswer( + cached, + sqlContext.range(1, 100).selectExpr("id % 10 as id") + .rdd.map(id => Tuple1(s"str_$id")).toDF("i") + ) + + // Drop the cache. 
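As the SPARK-10422 test title above suggests, the underlying hazard is the classic reuse-without-copy aliasing bug: a builder that recycles a single mutable buffer must clone each value before storing it, or every stored reference ends up pointing at the last value written. A Spark-free sketch of that failure mode (plain Scala, illustrative names):

    val reused = new StringBuilder
    // Storing the shared builder itself: every element aliases the same object,
    // so after the loop all of them "contain" only the last value.
    val aliased = (1 to 3).map { i => reused.clear(); reused.append(s"str_$i"); reused }
    // Taking a snapshot (the moral equivalent of clone/copy) keeps each value distinct.
    val copied = (1 to 3).map { i => reused.clear(); reused.append(s"str_$i"); reused.toString }
    assert(aliased.map(_.toString).distinct == Seq("str_3"))
    assert(copied == Seq("str_1", "str_2", "str_3"))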
+ cached.unpersist() + } + + test("SPARK-10859: Predicates pushed to InMemoryColumnarTableScan are not evaluated correctly") { + val data = sqlContext.range(10).selectExpr("id", "cast(id as string) as s") + data.cache() + assert(data.count() === 10) + assert(data.filter($"s" === "3").count() === 1) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala index ab2644eb4581d..6b7401464f46f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala @@ -25,32 +25,32 @@ import org.apache.spark.sql.test.SQLTestData._ class PartitionBatchPruningSuite extends SparkFunSuite with SharedSQLContext { import testImplicits._ - private lazy val originalColumnBatchSize = ctx.conf.columnBatchSize - private lazy val originalInMemoryPartitionPruning = ctx.conf.inMemoryPartitionPruning + private lazy val originalColumnBatchSize = sqlContext.conf.columnBatchSize + private lazy val originalInMemoryPartitionPruning = sqlContext.conf.inMemoryPartitionPruning override protected def beforeAll(): Unit = { super.beforeAll() // Make a table with 5 partitions, 2 batches per partition, 10 elements per batch - ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, 10) + sqlContext.setConf(SQLConf.COLUMN_BATCH_SIZE, 10) - val pruningData = ctx.sparkContext.makeRDD((1 to 100).map { key => + val pruningData = sparkContext.makeRDD((1 to 100).map { key => val string = if (((key - 1) / 10) % 2 == 0) null else key.toString TestData(key, string) }, 5).toDF() pruningData.registerTempTable("pruningData") // Enable in-memory partition pruning - ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true) + sqlContext.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true) // Enable in-memory table scan accumulators - ctx.setConf("spark.sql.inMemoryTableScanStatistics.enable", "true") - ctx.cacheTable("pruningData") + sqlContext.setConf("spark.sql.inMemoryTableScanStatistics.enable", "true") + sqlContext.cacheTable("pruningData") } override protected def afterAll(): Unit = { try { - ctx.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize) - ctx.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) - ctx.uncacheTable("pruningData") + sqlContext.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize) + sqlContext.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) + sqlContext.uncacheTable("pruningData") } finally { super.afterAll() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala index 8998f5111124c..911d12e93e503 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala @@ -22,6 +22,8 @@ import org.apache.spark.sql.catalyst.plans.physical.SinglePartition import org.apache.spark.sql.test.SharedSQLContext class ExchangeSuite extends SparkPlanTest with SharedSQLContext { + import testImplicits.localSeqToDataFrameHolder + test("shuffling UnsafeRows in exchange") { val input = (1 to 1000).map(Tuple1.apply) checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 
fad93b014c237..cafa1d5154788 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution -import org.apache.spark.SparkFunSuite import org.apache.spark.rdd.RDD import org.apache.spark.sql.{execution, Row, SQLConf} import org.apache.spark.sql.catalyst.InternalRow @@ -31,14 +30,14 @@ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ -class PlannerSuite extends SparkFunSuite with SharedSQLContext { +class PlannerSuite extends SharedSQLContext { import testImplicits._ setupTestData() private def testPartialAggregationPlan(query: LogicalPlan): Unit = { - val _ctx = ctx - import _ctx.planner._ + val planner = sqlContext.planner + import planner._ val plannedOption = HashAggregation(query).headOption.orElse(Aggregation(query).headOption) val planned = plannedOption.getOrElse( @@ -53,8 +52,8 @@ class PlannerSuite extends SparkFunSuite with SharedSQLContext { } test("unions are collapsed") { - val _ctx = ctx - import _ctx.planner._ + val planner = sqlContext.planner + import planner._ val query = testData.unionAll(testData).unionAll(testData).logicalPlan val planned = BasicOperators(query).head val logicalUnions = query collect { case u: logical.Union => u } @@ -81,33 +80,30 @@ class PlannerSuite extends SparkFunSuite with SharedSQLContext { } test("sizeInBytes estimation of limit operator for broadcast hash join optimization") { - def checkPlan(fieldTypes: Seq[DataType], newThreshold: Int): Unit = { - ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, newThreshold) - val fields = fieldTypes.zipWithIndex.map { - case (dataType, index) => StructField(s"c${index}", dataType, true) - } :+ StructField("key", IntegerType, true) - val schema = StructType(fields) - val row = Row.fromSeq(Seq.fill(fields.size)(null)) - val rowRDD = ctx.sparkContext.parallelize(row :: Nil) - ctx.createDataFrame(rowRDD, schema).registerTempTable("testLimit") - - val planned = sql( - """ - |SELECT l.a, l.b - |FROM testData2 l JOIN (SELECT * FROM testLimit LIMIT 1) r ON (l.a = r.key) - """.stripMargin).queryExecution.executedPlan - - val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join } - val shuffledHashJoins = planned.collect { case join: ShuffledHashJoin => join } - - assert(broadcastHashJoins.size === 1, "Should use broadcast hash join") - assert(shuffledHashJoins.isEmpty, "Should not use shuffled hash join") - - ctx.dropTempTable("testLimit") + def checkPlan(fieldTypes: Seq[DataType]): Unit = { + withTempTable("testLimit") { + val fields = fieldTypes.zipWithIndex.map { + case (dataType, index) => StructField(s"c${index}", dataType, true) + } :+ StructField("key", IntegerType, true) + val schema = StructType(fields) + val row = Row.fromSeq(Seq.fill(fields.size)(null)) + val rowRDD = sparkContext.parallelize(row :: Nil) + sqlContext.createDataFrame(rowRDD, schema).registerTempTable("testLimit") + + val planned = sql( + """ + |SELECT l.a, l.b + |FROM testData2 l JOIN (SELECT * FROM testLimit LIMIT 1) r ON (l.a = r.key) + """.stripMargin).queryExecution.executedPlan + + val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join } + val shuffledHashJoins = planned.collect { case join: ShuffledHashJoin => join } + + assert(broadcastHashJoins.size === 1, "Should use broadcast hash join") + assert(shuffledHashJoins.isEmpty, "Should not use shuffled hash join") + } } - val origThreshold = 
ctx.conf.autoBroadcastJoinThreshold - val simpleTypes = NullType :: BooleanType :: @@ -124,7 +120,9 @@ class PlannerSuite extends SparkFunSuite with SharedSQLContext { StringType :: BinaryType :: Nil - checkPlan(simpleTypes, newThreshold = 16434) + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "16434") { + checkPlan(simpleTypes) + } val complexTypes = ArrayType(DoubleType, true) :: @@ -136,36 +134,37 @@ class PlannerSuite extends SparkFunSuite with SharedSQLContext { StructField("b", ArrayType(DoubleType), nullable = false), StructField("c", DoubleType, nullable = false))) :: Nil - checkPlan(complexTypes, newThreshold = 901617) - - ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold) + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "901617") { + checkPlan(complexTypes) + } } test("InMemoryRelation statistics propagation") { - val origThreshold = ctx.conf.autoBroadcastJoinThreshold - ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, 81920) - - testData.limit(3).registerTempTable("tiny") - sql("CACHE TABLE tiny") + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "81920") { + withTempTable("tiny") { + testData.limit(3).registerTempTable("tiny") + sql("CACHE TABLE tiny") - val a = testData.as("a") - val b = ctx.table("tiny").as("b") - val planned = a.join(b, $"a.key" === $"b.key").queryExecution.executedPlan + val a = testData.as("a") + val b = sqlContext.table("tiny").as("b") + val planned = a.join(b, $"a.key" === $"b.key").queryExecution.executedPlan - val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join } - val shuffledHashJoins = planned.collect { case join: ShuffledHashJoin => join } + val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join } + val shuffledHashJoins = planned.collect { case join: ShuffledHashJoin => join } - assert(broadcastHashJoins.size === 1, "Should use broadcast hash join") - assert(shuffledHashJoins.isEmpty, "Should not use shuffled hash join") + assert(broadcastHashJoins.size === 1, "Should use broadcast hash join") + assert(shuffledHashJoins.isEmpty, "Should not use shuffled hash join") - ctx.setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, origThreshold) + sqlContext.clearCache() + } + } } test("efficient limit -> project -> sort") { { val query = testData.select('key, 'value).sort('key).limit(2).logicalPlan - val planned = ctx.planner.TakeOrderedAndProject(query) + val planned = sqlContext.planner.TakeOrderedAndProject(query) assert(planned.head.isInstanceOf[execution.TakeOrderedAndProject]) assert(planned.head.output === testData.select('key, 'value).logicalPlan.output) } @@ -175,7 +174,7 @@ class PlannerSuite extends SparkFunSuite with SharedSQLContext { // into it. 
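The TakeOrderedAndProject assertions in this test (above and just below) lean on the fact that a per-row projection commutes with take, so sort + limit + project can be fused into a single top-N pass. A plain-Scala sketch of that equivalence, illustrative rather than the planner's actual code:

    case class Rec(key: Int, value: String)
    val rows = Seq(Rec(3, "c"), Rec(1, "a"), Rec(2, "b"))
    // Project every row after sorting, then take 2 ...
    val naive = rows.sortBy(_.key).map(r => (r.value, r.key)).take(2)
    // ... versus take the top 2 by key first and project only those.
    val fused = rows.sortBy(_.key).take(2).map(r => (r.value, r.key))
    assert(naive == fused) // both are Seq(("a", 1), ("b", 2))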
val query = testData.select('key, 'value).sort('key).select('value, 'key).limit(2).logicalPlan - val planned = ctx.planner.TakeOrderedAndProject(query) + val planned = sqlContext.planner.TakeOrderedAndProject(query) assert(planned.head.isInstanceOf[execution.TakeOrderedAndProject]) assert(planned.head.output === testData.select('value, 'key).logicalPlan.output) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala index ef6ad59b71fb3..5dc37e5c3c238 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala @@ -32,27 +32,27 @@ class RowFormatConvertersSuite extends SparkPlanTest with SharedSQLContext { case c: ConvertToSafe => c } - private val outputsSafe = ExternalSort(Nil, false, PhysicalRDD(Seq.empty, null, "name")) + private val outputsSafe = Sort(Nil, false, PhysicalRDD(Seq.empty, null, "name")) assert(!outputsSafe.outputsUnsafeRows) private val outputsUnsafe = TungstenSort(Nil, false, PhysicalRDD(Seq.empty, null, "name")) assert(outputsUnsafe.outputsUnsafeRows) test("planner should insert unsafe->safe conversions when required") { val plan = Limit(10, outputsUnsafe) - val preparedPlan = ctx.prepareForExecution.execute(plan) + val preparedPlan = sqlContext.prepareForExecution.execute(plan) assert(preparedPlan.children.head.isInstanceOf[ConvertToSafe]) } test("filter can process unsafe rows") { val plan = Filter(IsNull(IsNull(Literal(1))), outputsUnsafe) - val preparedPlan = ctx.prepareForExecution.execute(plan) + val preparedPlan = sqlContext.prepareForExecution.execute(plan) assert(getConverters(preparedPlan).size === 1) assert(preparedPlan.outputsUnsafeRows) } test("filter can process safe rows") { val plan = Filter(IsNull(IsNull(Literal(1))), outputsSafe) - val preparedPlan = ctx.prepareForExecution.execute(plan) + val preparedPlan = sqlContext.prepareForExecution.execute(plan) assert(getConverters(preparedPlan).isEmpty) assert(!preparedPlan.outputsUnsafeRows) } @@ -67,33 +67,33 @@ class RowFormatConvertersSuite extends SparkPlanTest with SharedSQLContext { test("union requires all of its input rows' formats to agree") { val plan = Union(Seq(outputsSafe, outputsUnsafe)) assert(plan.canProcessSafeRows && plan.canProcessUnsafeRows) - val preparedPlan = ctx.prepareForExecution.execute(plan) + val preparedPlan = sqlContext.prepareForExecution.execute(plan) assert(preparedPlan.outputsUnsafeRows) } test("union can process safe rows") { val plan = Union(Seq(outputsSafe, outputsSafe)) - val preparedPlan = ctx.prepareForExecution.execute(plan) + val preparedPlan = sqlContext.prepareForExecution.execute(plan) assert(!preparedPlan.outputsUnsafeRows) } test("union can process unsafe rows") { val plan = Union(Seq(outputsUnsafe, outputsUnsafe)) - val preparedPlan = ctx.prepareForExecution.execute(plan) + val preparedPlan = sqlContext.prepareForExecution.execute(plan) assert(preparedPlan.outputsUnsafeRows) } test("round trip with ConvertToUnsafe and ConvertToSafe") { val input = Seq(("hello", 1), ("world", 2)) checkAnswer( - ctx.createDataFrame(input), + sqlContext.createDataFrame(input), plan => ConvertToSafe(ConvertToUnsafe(plan)), input.map(Row.fromTuple) ) } test("SPARK-9683: copy UTF8String when convert unsafe array/map to safe") { - SparkPlan.currentContext.set(ctx) + SparkPlan.currentContext.set(sqlContext) val schema = 
ArrayType(StringType) val rows = (1 to 100).map { i => InternalRow(new GenericArrayData(Array[Any](UTF8String.fromString(i.toString)))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala new file mode 100644 index 0000000000000..63639681ef80a --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import java.util.Properties + +import scala.collection.parallel.CompositeThrowable + +import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.sql.SQLContext + +class SQLExecutionSuite extends SparkFunSuite { + + test("concurrent query execution (SPARK-10548)") { + // Try to reproduce the issue with the old SparkContext + val conf = new SparkConf() + .setMaster("local[*]") + .setAppName("test") + val badSparkContext = new BadSparkContext(conf) + try { + testConcurrentQueryExecution(badSparkContext) + fail("unable to reproduce SPARK-10548") + } catch { + case e: IllegalArgumentException => + assert(e.getMessage.contains(SQLExecution.EXECUTION_ID_KEY)) + } finally { + badSparkContext.stop() + } + + // Verify that the issue is fixed with the latest SparkContext + val goodSparkContext = new SparkContext(conf) + try { + testConcurrentQueryExecution(goodSparkContext) + } finally { + goodSparkContext.stop() + } + } + + /** + * Trigger SPARK-10548 by mocking a parent and its child thread executing queries concurrently. + */ + private def testConcurrentQueryExecution(sc: SparkContext): Unit = { + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + // Initialize local properties. This is necessary for the test to pass. + sc.getLocalProperties + + // Set up a thread that runs executes a simple SQL query. + // Before starting the thread, mutate the execution ID in the parent. + // The child thread should not see the effect of this change. + var throwable: Option[Throwable] = None + val child = new Thread { + override def run(): Unit = { + try { + sc.parallelize(1 to 100).map { i => (i, i) }.toDF("a", "b").collect() + } catch { + case t: Throwable => + throwable = Some(t) + } + + } + } + sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, "anything") + child.start() + child.join() + + // The throwable is thrown from the child thread so it doesn't have a helpful stack trace + throwable.foreach { t => + t.setStackTrace(t.getStackTrace ++ Thread.currentThread.getStackTrace) + throw t + } + } + +} + +/** + * A bad [[SparkContext]] that does not clone the inheritable thread local properties + * when passing them to children threads. 
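The description above is exactly the trap: new Properties(parent) does not copy the parent's entries, it keeps the parent as a live defaults table, so mutations made later in the parent thread show through in the child. A self-contained sketch of the difference, using only the JDK (no Spark APIs; names are illustrative):

    import java.util.Properties

    object InheritedPropsSketch {
      // BadSparkContext-style: the child wraps the parent's live Properties as defaults.
      val leaky = new InheritableThreadLocal[Properties] {
        override protected def childValue(parent: Properties): Properties = new Properties(parent)
        override protected def initialValue(): Properties = new Properties()
      }
      // Copy-on-inherit: the child gets a snapshot of the parent's entries.
      val isolated = new InheritableThreadLocal[Properties] {
        override protected def childValue(parent: Properties): Properties = {
          val copy = new Properties()
          copy.putAll(parent)
          copy
        }
        override protected def initialValue(): Properties = new Properties()
      }

      def main(args: Array[String]): Unit = {
        leaky.get().setProperty("execution.id", "0")
        isolated.get().setProperty("execution.id", "0")
        val child = new Thread {
          override def run(): Unit = {
            println(leaky.get().getProperty("execution.id"))    // prints 1: the parent's later change leaked in
            println(isolated.get().getProperty("execution.id")) // prints 0: the snapshot is unaffected
          }
        }
        // Inheritable values were captured when `child` was constructed; now mutate the parent.
        leaky.get().setProperty("execution.id", "1")
        isolated.get().setProperty("execution.id", "1")
        child.start()
        child.join()
      }
    }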
+ */ +private class BadSparkContext(conf: SparkConf) extends SparkContext(conf) { + protected[spark] override val localProperties = new InheritableThreadLocal[Properties] { + override protected def childValue(parent: Properties): Properties = new Properties(parent) + override protected def initialValue(): Properties = new Properties() + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala index 8fa77b0fcb7b7..847c188a30333 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.test.SharedSQLContext class SortSuite extends SparkPlanTest with SharedSQLContext { + import testImplicits.localSeqToDataFrameHolder // This test was originally added as an example of how to use [[SparkPlanTest]]; // it's not designed to be a comprehensive test of ExternalSort. @@ -35,13 +36,13 @@ class SortSuite extends SparkPlanTest with SharedSQLContext { checkAnswer( input.toDF("a", "b", "c"), - ExternalSort('a.asc :: 'b.asc :: Nil, global = true, _: SparkPlan), + Sort('a.asc :: 'b.asc :: Nil, global = true, _: SparkPlan), input.sortBy(t => (t._1, t._2)).map(Row.fromTuple), sortAnswers = false) checkAnswer( input.toDF("a", "b", "c"), - ExternalSort('b.asc :: 'a.asc :: Nil, global = true, _: SparkPlan), + Sort('b.asc :: 'a.asc :: Nil, global = true, _: SparkPlan), input.sortBy(t => (t._2, t._1)).map(Row.fromTuple), sortAnswers = false) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index 3a87f374d94b0..8549a6a0f6643 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -24,21 +24,14 @@ import scala.util.control.NonFatal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{DataFrame, DataFrameHolder, Row, SQLContext} import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.test.SQLTestUtils /** * Base class for writing tests for individual physical operators. For an example of how this * class's test helper methods can be used, see [[SortSuite]]. */ private[sql] abstract class SparkPlanTest extends SparkFunSuite { - protected def _sqlContext: SQLContext - - /** - * Creates a DataFrame from a local Seq of Product. - */ - implicit def localSeqToDataFrameHolder[A <: Product : TypeTag](data: Seq[A]): DataFrameHolder = { - _sqlContext.implicits.localSeqToDataFrameHolder(data) - } + protected def sqlContext: SQLContext /** * Runs the plan and makes sure the answer matches the expected result. 
@@ -98,7 +91,7 @@ private[sql] abstract class SparkPlanTest extends SparkFunSuite { planFunction: Seq[SparkPlan] => SparkPlan, expectedAnswer: Seq[Row], sortAnswers: Boolean = true): Unit = { - SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer, sortAnswers, _sqlContext) match { + SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer, sortAnswers, sqlContext) match { case Some(errorMessage) => fail(errorMessage) case None => } @@ -122,7 +115,7 @@ private[sql] abstract class SparkPlanTest extends SparkFunSuite { expectedPlanFunction: SparkPlan => SparkPlan, sortAnswers: Boolean = true): Unit = { SparkPlanTest.checkAnswer( - input, planFunction, expectedPlanFunction, sortAnswers, _sqlContext) match { + input, planFunction, expectedPlanFunction, sortAnswers, sqlContext) match { case Some(errorMessage) => fail(errorMessage) case None => } @@ -149,13 +142,13 @@ object SparkPlanTest { planFunction: SparkPlan => SparkPlan, expectedPlanFunction: SparkPlan => SparkPlan, sortAnswers: Boolean, - _sqlContext: SQLContext): Option[String] = { + sqlContext: SQLContext): Option[String] = { val outputPlan = planFunction(input.queryExecution.sparkPlan) val expectedOutputPlan = expectedPlanFunction(input.queryExecution.sparkPlan) val expectedAnswer: Seq[Row] = try { - executePlan(expectedOutputPlan, _sqlContext) + executePlan(expectedOutputPlan, sqlContext) } catch { case NonFatal(e) => val errorMessage = @@ -170,7 +163,7 @@ object SparkPlanTest { } val actualAnswer: Seq[Row] = try { - executePlan(outputPlan, _sqlContext) + executePlan(outputPlan, sqlContext) } catch { case NonFatal(e) => val errorMessage = @@ -184,7 +177,7 @@ object SparkPlanTest { return Some(errorMessage) } - compareAnswers(actualAnswer, expectedAnswer, sortAnswers).map { errorMessage => + SQLTestUtils.compareAnswers(actualAnswer, expectedAnswer, sortAnswers).map { errorMessage => s""" | Results do not match. | Actual result Spark plan: @@ -210,12 +203,12 @@ object SparkPlanTest { planFunction: Seq[SparkPlan] => SparkPlan, expectedAnswer: Seq[Row], sortAnswers: Boolean, - _sqlContext: SQLContext): Option[String] = { + sqlContext: SQLContext): Option[String] = { val outputPlan = planFunction(input.map(_.queryExecution.sparkPlan)) val sparkAnswer: Seq[Row] = try { - executePlan(outputPlan, _sqlContext) + executePlan(outputPlan, sqlContext) } catch { case NonFatal(e) => val errorMessage = @@ -229,7 +222,7 @@ object SparkPlanTest { return Some(errorMessage) } - compareAnswers(sparkAnswer, expectedAnswer, sortAnswers).map { errorMessage => + SQLTestUtils.compareAnswers(sparkAnswer, expectedAnswer, sortAnswers).map { errorMessage => s""" | Results do not match for Spark plan: | $outputPlan @@ -238,61 +231,21 @@ object SparkPlanTest { } } - private def compareAnswers( - sparkAnswer: Seq[Row], - expectedAnswer: Seq[Row], - sort: Boolean): Option[String] = { - def prepareAnswer(answer: Seq[Row]): Seq[Row] = { - // Converts data to types that we can do equality comparison using Scala collections. - // For BigDecimal type, the Scala type has a better definition of equality test (similar to - // Java's java.math.BigDecimal.compareTo). - // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for - // equality test. 
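The comments just below explain the normalization that the soon-to-be-shared comparison helper performs before checking answers; presumably SQLTestUtils.compareAnswers keeps the same behavior. Distilled to plain Scala over Seq[Any] (illustrative, not the helper's actual code):

    def normalize(rows: Seq[Seq[Any]], sort: Boolean): Seq[Seq[Any]] = {
      val converted = rows.map(_.map {
        case d: java.math.BigDecimal => BigDecimal(d) // Scala BigDecimal compares by value, ignoring scale
        case b: Array[Byte]          => b.toSeq       // element-wise equality instead of reference equality
        case o                       => o
      })
      if (sort) converted.sortBy(_.toString()) else converted
    }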
- // This function is copied from Catalyst's QueryTest - val converted: Seq[Row] = answer.map { s => - Row.fromSeq(s.toSeq.map { - case d: java.math.BigDecimal => BigDecimal(d) - case b: Array[Byte] => b.toSeq - case o => o - }) - } - if (sort) { - converted.sortBy(_.toString()) - } else { - converted - } - } - if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) { - val errorMessage = - s""" - | == Results == - | ${sideBySide( - s"== Expected Answer - ${expectedAnswer.size} ==" +: - prepareAnswer(expectedAnswer).map(_.toString()), - s"== Actual Answer - ${sparkAnswer.size} ==" +: - prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n")} - """.stripMargin - Some(errorMessage) - } else { - None - } - } - - private def executePlan(outputPlan: SparkPlan, _sqlContext: SQLContext): Seq[Row] = { + private def executePlan(outputPlan: SparkPlan, sqlContext: SQLContext): Seq[Row] = { // A very simple resolver to make writing tests easier. In contrast to the real resolver // this is always case sensitive and does not try to handle scoping or complex type resolution. - val resolvedPlan = _sqlContext.prepareForExecution.execute( + val resolvedPlan = sqlContext.prepareForExecution.execute( outputPlan transform { case plan: SparkPlan => val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap - plan.transformExpressions { + plan transformExpressions { case UnresolvedAttribute(Seq(u)) => inputMap.getOrElse(u, sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap")) } } ) - resolvedPlan.executeCollect().toSeq + resolvedPlan.executeCollectPublic().toSeq } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala index 3158458edb831..7a0f0dfd2b7f1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala @@ -29,15 +29,16 @@ import org.apache.spark.sql.types._ * A test suite that generates randomized data to test the [[TungstenSort]] operator. 
*/ class TungstenSortSuite extends SparkPlanTest with SharedSQLContext { + import testImplicits.localSeqToDataFrameHolder override def beforeAll(): Unit = { super.beforeAll() - ctx.conf.setConf(SQLConf.CODEGEN_ENABLED, true) + sqlContext.conf.setConf(SQLConf.CODEGEN_ENABLED, true) } override def afterAll(): Unit = { try { - ctx.conf.setConf(SQLConf.CODEGEN_ENABLED, SQLConf.CODEGEN_ENABLED.defaultValue.get) + sqlContext.conf.unsetConf(SQLConf.CODEGEN_ENABLED) } finally { super.afterAll() } @@ -64,8 +65,7 @@ class TungstenSortSuite extends SparkPlanTest with SharedSQLContext { } test("sorting updates peak execution memory") { - val sc = ctx.sparkContext - AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "unsafe external sort") { + AccumulatorSuite.verifyPeakExecutionMemorySet(sparkContext, "unsafe external sort") { checkThatPlansAgree( (1 to 100).map(v => Tuple1(v)).toDF("a"), (child: SparkPlan) => TungstenSort('a.asc :: Nil, true, child), @@ -83,8 +83,8 @@ class TungstenSortSuite extends SparkPlanTest with SharedSQLContext { ) { test(s"sorting on $dataType with nullable=$nullable, sortOrder=$sortOrder") { val inputData = Seq.fill(1000)(randomDataGenerator()) - val inputDf = ctx.createDataFrame( - ctx.sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))), + val inputDf = sqlContext.createDataFrame( + sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))), StructType(StructField("a", dataType, nullable = true) :: Nil) ) assert(TungstenSort.supportsSchema(inputDf.schema)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala index d1f0b2b1fc52f..1739798a24e0a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala @@ -23,9 +23,10 @@ import scala.util.{Try, Random} import org.scalatest.Matchers -import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection} import org.apache.spark.{TaskContextImpl, TaskContext, SparkFunSuite} +import org.apache.spark.shuffle.ShuffleMemoryManager import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager} @@ -199,9 +200,7 @@ class UnsafeFixedWidthAggregationMapSuite val sorter = map.destructAndCreateExternalSorter() withClue(s"destructAndCreateExternalSorter should release memory used by the map") { - // 4096 * 16 is the initial size allocated for the pointer/prefix array in the in-mem sorter. - assert(shuffleMemoryManager.getMemoryConsumptionForThisTask() === - initialMemoryConsumption + 4096 * 16) + assert(shuffleMemoryManager.getMemoryConsumptionForThisTask() === initialMemoryConsumption) } // Add more keys to the sorter and make sure the results come out sorted. @@ -304,9 +303,7 @@ class UnsafeFixedWidthAggregationMapSuite val sorter = map.destructAndCreateExternalSorter() withClue(s"destructAndCreateExternalSorter should release memory used by the map") { - // 4096 * 16 is the initial size allocated for the pointer/prefix array in the in-mem sorter. 
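The magic number in the assertion being removed here is just 4096 initial slots times 16 bytes per slot; the split into an 8-byte record pointer plus an 8-byte sort prefix is an assumption for illustration, not something this diff states:

    val initialSlots = 4096
    val bytesPerSlot = 8 + 8                      // assumed: one long for the record pointer, one for the sort prefix
    assert(initialSlots * bytesPerSlot == 65536)  // the extra 64 KB the old expected value accounted for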
- assert(shuffleMemoryManager.getMemoryConsumptionForThisTask() === - initialMemoryConsumption + 4096 * 16) + assert(shuffleMemoryManager.getMemoryConsumptionForThisTask() === initialMemoryConsumption) } // Add more keys to the sorter and make sure the results come out sorted. @@ -325,7 +322,7 @@ class UnsafeFixedWidthAggregationMapSuite // At here, we also test if copy is correct. iter.getKey.copy() iter.getValue.copy() - count += 1; + count += 1 } // 1 record was from the map and 4096 records were explicitly inserted. @@ -333,4 +330,48 @@ class UnsafeFixedWidthAggregationMapSuite map.free() } + + testWithMemoryLeakDetection("convert to external sorter under memory pressure (SPARK-10474)") { + val smm = ShuffleMemoryManager.createForTesting(65536) + val pageSize = 4096 + val map = new UnsafeFixedWidthAggregationMap( + emptyAggregationBuffer, + aggBufferSchema, + groupKeySchema, + taskMemoryManager, + smm, + 128, // initial capacity + pageSize, + false // disable perf metrics + ) + + // Insert into the map until we've run out of space + val rand = new Random(42) + var hasSpace = true + while (hasSpace) { + val str = rand.nextString(1024) + val buf = map.getAggregationBuffer(InternalRow(UTF8String.fromString(str))) + if (buf == null) { + hasSpace = false + } else { + buf.setInt(0, str.length) + } + } + + // Ensure we're actually maxed out by asserting that we can't acquire even just 1 byte + assert(smm.tryToAcquire(1) === 0) + + // Convert the map into a sorter. This used to fail before the fix for SPARK-10474 + // because we would try to acquire space for the in-memory sorter pointer array before + // actually releasing the pages despite having spilled all of them. + var sorter: UnsafeKVExternalSorter = null + try { + sorter = map.destructAndCreateExternalSorter() + } finally { + if (sorter != null) { + sorter.cleanupResources() + } + } + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index bd02c73a26ace..f7d48bc53ebbc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -17,13 +17,18 @@ package org.apache.spark.sql.execution -import java.io.{DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} +import java.io.{File, ByteArrayInputStream, ByteArrayOutputStream} -import org.apache.spark.SparkFunSuite +import org.apache.spark.executor.ShuffleWriteMetrics +import org.apache.spark.rdd.RDD +import org.apache.spark.storage.ShuffleBlockId +import org.apache.spark.util.collection.ExternalSorter +import org.apache.spark.util.Utils import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types._ +import org.apache.spark._ /** @@ -37,12 +42,18 @@ class ClosableByteArrayInputStream(buf: Array[Byte]) extends ByteArrayInputStrea } } -class UnsafeRowSerializerSuite extends SparkFunSuite { +class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { private def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = { - val internalRow = CatalystTypeConverters.convertToCatalyst(row).asInstanceOf[InternalRow] + val converter = unsafeRowConverter(schema) + converter(row) + } + + private def unsafeRowConverter(schema: Array[DataType]): Row 
=> UnsafeRow = { val converter = UnsafeProjection.create(schema) - converter.apply(internalRow) + (row: Row) => { + converter(CatalystTypeConverters.convertToCatalyst(row).asInstanceOf[InternalRow]) + } } test("toUnsafeRow() test helper method") { @@ -77,14 +88,68 @@ class UnsafeRowSerializerSuite extends SparkFunSuite { } test("close empty input stream") { - val baos = new ByteArrayOutputStream() - val dout = new DataOutputStream(baos) - dout.writeInt(-1) // EOF - dout.flush() - val input = new ClosableByteArrayInputStream(baos.toByteArray) + val input = new ClosableByteArrayInputStream(Array.empty) val serializer = new UnsafeRowSerializer(numFields = 2).newInstance() val deserializerIter = serializer.deserializeStream(input).asKeyValueIterator assert(!deserializerIter.hasNext) assert(input.closed) } + + test("SPARK-10466: external sorter spilling with unsafe row serializer") { + var sc: SparkContext = null + var outputFile: File = null + val oldEnv = SparkEnv.get // save the old SparkEnv, as it will be overwritten + Utils.tryWithSafeFinally { + val conf = new SparkConf() + .set("spark.shuffle.spill.initialMemoryThreshold", "1024") + .set("spark.shuffle.sort.bypassMergeThreshold", "0") + .set("spark.shuffle.memoryFraction", "0.0001") + + sc = new SparkContext("local", "test", conf) + outputFile = File.createTempFile("test-unsafe-row-serializer-spill", "") + // prepare data + val converter = unsafeRowConverter(Array(IntegerType)) + val data = (1 to 1000).iterator.map { i => + (i, converter(Row(i))) + } + val sorter = new ExternalSorter[Int, UnsafeRow, UnsafeRow]( + partitioner = Some(new HashPartitioner(10)), + serializer = Some(new UnsafeRowSerializer(numFields = 1))) + + // Ensure we spilled something and have to merge them later + assert(sorter.numSpills === 0) + sorter.insertAll(data) + assert(sorter.numSpills > 0) + + // Merging spilled files should not throw assertion error + val taskContext = + new TaskContextImpl(0, 0, 0, 0, null, null, InternalAccumulator.create(sc)) + taskContext.taskMetrics.shuffleWriteMetrics = Some(new ShuffleWriteMetrics) + sorter.writePartitionedFile(ShuffleBlockId(0, 0, 0), taskContext, outputFile) + } { + // Clean up + if (sc != null) { + sc.stop() + } + + // restore the spark env + SparkEnv.set(oldEnv) + + if (outputFile != null) { + outputFile.delete() + } + } + } + + test("SPARK-10403: unsafe row serializer with UnsafeShuffleManager") { + val conf = new SparkConf() + .set("spark.shuffle.manager", "tungsten-sort") + sc = new SparkContext("local", "test", conf) + val row = Row("Hello", 123) + val unsafeRow = toUnsafeRow(row, Array(StringType, IntegerType)) + val rowsRDD = sc.parallelize(Seq((0, unsafeRow), (1, unsafeRow), (0, unsafeRow))) + .asInstanceOf[RDD[Product2[Int, InternalRow]]] + val shuffled = new ShuffledRowRDD(rowsRDD, new UnsafeRowSerializer(2), 2) + shuffled.count() + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala index 5fdb82b067728..afda0d29f6d91 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala @@ -37,7 +37,7 @@ class TungstenAggregationIteratorSuite extends SparkFunSuite with SharedSQLConte val newMutableProjection = (expr: Seq[Expression], schema: Seq[Attribute]) => { () => new 
InterpretedMutableProjection(expr, schema) } - val dummyAccum = SQLMetrics.createLongMetric(ctx.sparkContext, "dummy") + val dummyAccum = SQLMetrics.createLongMetric(sparkContext, "dummy") iter = new TungstenAggregationIterator(Seq.empty, Seq.empty, Seq.empty, 0, Seq.empty, newMutableProjection, Seq.empty, None, dummyAccum, dummyAccum) val numPages = iter.getHashMap.getNumDataPages diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 1174b27732f22..b614e6c4148fd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -24,7 +24,7 @@ import com.fasterxml.jackson.core.JsonFactory import org.apache.spark.rdd.RDD import org.scalactic.Tolerance._ -import org.apache.spark.sql.{QueryTest, Row, SQLConf} +import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.InferSchema.compatibleType @@ -215,7 +215,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("Complex field and type inferring with null in sampling") { - val jsonDF = ctx.read.json(jsonNullStruct) + val jsonDF = sqlContext.read.json(jsonNullStruct) val expectedSchema = StructType( StructField("headers", StructType( StructField("Charset", StringType, true) :: @@ -234,7 +234,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("Primitive field and type inferring") { - val jsonDF = ctx.read.json(primitiveFieldAndType) + val jsonDF = sqlContext.read.json(primitiveFieldAndType) val expectedSchema = StructType( StructField("bigInteger", DecimalType(20, 0), true) :: @@ -262,7 +262,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("Complex field and type inferring") { - val jsonDF = ctx.read.json(complexFieldAndType1) + val jsonDF = sqlContext.read.json(complexFieldAndType1) val expectedSchema = StructType( StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) :: @@ -361,7 +361,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("GetField operation on complex data type") { - val jsonDF = ctx.read.json(complexFieldAndType1) + val jsonDF = sqlContext.read.json(complexFieldAndType1) jsonDF.registerTempTable("jsonTable") checkAnswer( @@ -377,7 +377,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("Type conflict in primitive field values") { - val jsonDF = ctx.read.json(primitiveFieldValueTypeConflict) + val jsonDF = sqlContext.read.json(primitiveFieldValueTypeConflict) val expectedSchema = StructType( StructField("num_bool", StringType, true) :: @@ -449,7 +449,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } ignore("Type conflict in primitive field values (Ignored)") { - val jsonDF = ctx.read.json(primitiveFieldValueTypeConflict) + val jsonDF = sqlContext.read.json(primitiveFieldValueTypeConflict) jsonDF.registerTempTable("jsonTable") // Right now, the analyzer does not promote strings in a boolean expression. 
@@ -502,7 +502,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("Type conflict in complex field values") { - val jsonDF = ctx.read.json(complexFieldValueTypeConflict) + val jsonDF = sqlContext.read.json(complexFieldValueTypeConflict) val expectedSchema = StructType( StructField("array", ArrayType(LongType, true), true) :: @@ -526,7 +526,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("Type conflict in array elements") { - val jsonDF = ctx.read.json(arrayElementTypeConflict) + val jsonDF = sqlContext.read.json(arrayElementTypeConflict) val expectedSchema = StructType( StructField("array1", ArrayType(StringType, true), true) :: @@ -554,7 +554,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("Handling missing fields") { - val jsonDF = ctx.read.json(missingFields) + val jsonDF = sqlContext.read.json(missingFields) val expectedSchema = StructType( StructField("a", BooleanType, true) :: @@ -573,9 +573,9 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { val dir = Utils.createTempDir() dir.delete() val path = dir.getCanonicalFile.toURI.toString - ctx.sparkContext.parallelize(1 to 100) + sparkContext.parallelize(1 to 100) .map(i => s"""{"a": 1, "b": "str$i"}""").saveAsTextFile(path) - val jsonDF = ctx.read.option("samplingRatio", "0.49").json(path) + val jsonDF = sqlContext.read.option("samplingRatio", "0.49").json(path) val analyzed = jsonDF.queryExecution.analyzed assert( @@ -590,7 +590,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { val schema = StructType(StructField("a", LongType, true) :: Nil) val logicalRelation = - ctx.read.schema(schema).json(path) + sqlContext.read.schema(schema).json(path) .queryExecution.analyzed.asInstanceOf[LogicalRelation] val relationWithSchema = logicalRelation.relation.asInstanceOf[JSONRelation] assert(relationWithSchema.paths === Array(path)) @@ -603,7 +603,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { dir.delete() val path = dir.getCanonicalPath primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).saveAsTextFile(path) - val jsonDF = ctx.read.json(path) + val jsonDF = sqlContext.read.json(path) val expectedSchema = StructType( StructField("bigInteger", DecimalType(20, 0), true) :: @@ -672,7 +672,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { StructField("null", StringType, true) :: StructField("string", StringType, true) :: Nil) - val jsonDF1 = ctx.read.schema(schema).json(path) + val jsonDF1 = sqlContext.read.schema(schema).json(path) assert(schema === jsonDF1.schema) @@ -689,7 +689,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { "this is a simple string.") ) - val jsonDF2 = ctx.read.schema(schema).json(primitiveFieldAndType) + val jsonDF2 = sqlContext.read.schema(schema).json(primitiveFieldAndType) assert(schema === jsonDF2.schema) @@ -710,7 +710,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { test("Applying schemas with MapType") { val schemaWithSimpleMap = StructType( StructField("map", MapType(StringType, IntegerType, true), false) :: Nil) - val jsonWithSimpleMap = ctx.read.schema(schemaWithSimpleMap).json(mapType1) + val jsonWithSimpleMap = sqlContext.read.schema(schemaWithSimpleMap).json(mapType1) jsonWithSimpleMap.registerTempTable("jsonWithSimpleMap") @@ -738,7 +738,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with 
TestJsonData { val schemaWithComplexMap = StructType( StructField("map", MapType(StringType, innerStruct, true), false) :: Nil) - val jsonWithComplexMap = ctx.read.schema(schemaWithComplexMap).json(mapType2) + val jsonWithComplexMap = sqlContext.read.schema(schemaWithComplexMap).json(mapType2) jsonWithComplexMap.registerTempTable("jsonWithComplexMap") @@ -764,7 +764,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("SPARK-2096 Correctly parse dot notations") { - val jsonDF = ctx.read.json(complexFieldAndType2) + val jsonDF = sqlContext.read.json(complexFieldAndType2) jsonDF.registerTempTable("jsonTable") checkAnswer( @@ -782,7 +782,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("SPARK-3390 Complex arrays") { - val jsonDF = ctx.read.json(complexFieldAndType2) + val jsonDF = sqlContext.read.json(complexFieldAndType2) jsonDF.registerTempTable("jsonTable") checkAnswer( @@ -805,7 +805,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("SPARK-3308 Read top level JSON arrays") { - val jsonDF = ctx.read.json(jsonArray) + val jsonDF = sqlContext.read.json(jsonArray) jsonDF.registerTempTable("jsonTable") checkAnswer( @@ -823,64 +823,63 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { test("Corrupt records") { // Test if we can query corrupt records. - val oldColumnNameOfCorruptRecord = ctx.conf.columnNameOfCorruptRecord - ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed") - - val jsonDF = ctx.read.json(corruptRecords) - jsonDF.registerTempTable("jsonTable") - - val schema = StructType( - StructField("_unparsed", StringType, true) :: - StructField("a", StringType, true) :: - StructField("b", StringType, true) :: - StructField("c", StringType, true) :: Nil) - - assert(schema === jsonDF.schema) - - // In HiveContext, backticks should be used to access columns starting with a underscore. - checkAnswer( - sql( - """ - |SELECT a, b, c, _unparsed - |FROM jsonTable - """.stripMargin), - Row(null, null, null, "{") :: - Row(null, null, null, "") :: - Row(null, null, null, """{"a":1, b:2}""") :: - Row(null, null, null, """{"a":{, b:3}""") :: - Row("str_a_4", "str_b_4", "str_c_4", null) :: - Row(null, null, null, "]") :: Nil - ) - - checkAnswer( - sql( - """ - |SELECT a, b, c - |FROM jsonTable - |WHERE _unparsed IS NULL - """.stripMargin), - Row("str_a_4", "str_b_4", "str_c_4") - ) - - checkAnswer( - sql( - """ - |SELECT _unparsed - |FROM jsonTable - |WHERE _unparsed IS NOT NULL - """.stripMargin), - Row("{") :: - Row("") :: - Row("""{"a":1, b:2}""") :: - Row("""{"a":{, b:3}""") :: - Row("]") :: Nil - ) - - ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, oldColumnNameOfCorruptRecord) + withSQLConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD.key -> "_unparsed") { + withTempTable("jsonTable") { + val jsonDF = sqlContext.read.json(corruptRecords) + jsonDF.registerTempTable("jsonTable") + + val schema = StructType( + StructField("_unparsed", StringType, true) :: + StructField("a", StringType, true) :: + StructField("b", StringType, true) :: + StructField("c", StringType, true) :: Nil) + + assert(schema === jsonDF.schema) + + // In HiveContext, backticks should be used to access columns starting with a underscore. 
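This corrupt-records test is the first spot where the patch replaces the manual save/set/restore of `COLUMN_NAME_OF_CORRUPT_RECORD` with `withSQLConf`, and wraps the temp table in `withTempTable` so it is dropped even if an assertion fails. A sketch of the semantics those helpers are assumed to provide; this is not the actual `SQLTestUtils` source, and the `conf` method names are assumptions:

    import scala.util.Try

    def withSQLConf(pairs: (String, String)*)(body: => Unit): Unit = {
      val (keys, values) = pairs.unzip
      // Remember current values so the original configuration can be restored.
      val previous = keys.map(k => Try(sqlContext.conf.getConfString(k)).toOption)
      keys.zip(values).foreach { case (k, v) => sqlContext.conf.setConfString(k, v) }
      try body finally keys.zip(previous).foreach {
        case (k, Some(v)) => sqlContext.conf.setConfString(k, v)
        case (k, None)    => sqlContext.conf.unsetConf(k)
      }
    }

    def withTempTable(names: String*)(body: => Unit): Unit = {
      // Run the body, then always drop the temporary tables it registered.
      try body finally names.foreach(sqlContext.dropTempTable)
    }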
+ checkAnswer( + sql( + """ + |SELECT a, b, c, _unparsed + |FROM jsonTable + """.stripMargin), + Row(null, null, null, "{") :: + Row(null, null, null, "") :: + Row(null, null, null, """{"a":1, b:2}""") :: + Row(null, null, null, """{"a":{, b:3}""") :: + Row("str_a_4", "str_b_4", "str_c_4", null) :: + Row(null, null, null, "]") :: Nil + ) + + checkAnswer( + sql( + """ + |SELECT a, b, c + |FROM jsonTable + |WHERE _unparsed IS NULL + """.stripMargin), + Row("str_a_4", "str_b_4", "str_c_4") + ) + + checkAnswer( + sql( + """ + |SELECT _unparsed + |FROM jsonTable + |WHERE _unparsed IS NOT NULL + """.stripMargin), + Row("{") :: + Row("") :: + Row("""{"a":1, b:2}""") :: + Row("""{"a":{, b:3}""") :: + Row("]") :: Nil + ) + } + } } test("SPARK-4068: nulls in arrays") { - val jsonDF = ctx.read.json(nullsInArrays) + val jsonDF = sqlContext.read.json(nullsInArrays) jsonDF.registerTempTable("jsonTable") val schema = StructType( @@ -926,7 +925,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { Row(values(0).toInt, values(1), values(2).toBoolean, r.split(",").toList, v5) } - val df1 = ctx.createDataFrame(rowRDD1, schema1) + val df1 = sqlContext.createDataFrame(rowRDD1, schema1) df1.registerTempTable("applySchema1") val df2 = df1.toDF val result = df2.toJSON.collect() @@ -949,7 +948,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) } - val df3 = ctx.createDataFrame(rowRDD2, schema2) + val df3 = sqlContext.createDataFrame(rowRDD2, schema2) df3.registerTempTable("applySchema2") val df4 = df3.toDF val result2 = df4.toJSON.collect() @@ -957,8 +956,8 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { assert(result2(1) === "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}") assert(result2(3) === "{\"f1\":{\"f11\":4,\"f12\":true},\"f2\":{\"D4\":2147483644}}") - val jsonDF = ctx.read.json(primitiveFieldAndType) - val primTable = ctx.read.json(jsonDF.toJSON) + val jsonDF = sqlContext.read.json(primitiveFieldAndType) + val primTable = sqlContext.read.json(jsonDF.toJSON) primTable.registerTempTable("primativeTable") checkAnswer( sql("select * from primativeTable"), @@ -970,8 +969,8 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { "this is a simple string.") ) - val complexJsonDF = ctx.read.json(complexFieldAndType1) - val compTable = ctx.read.json(complexJsonDF.toJSON) + val complexJsonDF = sqlContext.read.json(complexFieldAndType1) + val compTable = sqlContext.read.json(complexJsonDF.toJSON) compTable.registerTempTable("complexTable") // Access elements of a primitive array. 
checkAnswer( @@ -1039,25 +1038,25 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { Some(empty), 1.0, Some(StructType(StructField("a", IntegerType, true) :: Nil)), - None, None)(ctx) + None, None)(sqlContext) val logicalRelation0 = LogicalRelation(relation0) val relation1 = new JSONRelation( Some(singleRow), 1.0, Some(StructType(StructField("a", IntegerType, true) :: Nil)), - None, None)(ctx) + None, None)(sqlContext) val logicalRelation1 = LogicalRelation(relation1) val relation2 = new JSONRelation( Some(singleRow), 0.5, Some(StructType(StructField("a", IntegerType, true) :: Nil)), - None, None)(ctx) + None, None)(sqlContext) val logicalRelation2 = LogicalRelation(relation2) val relation3 = new JSONRelation( Some(singleRow), 1.0, Some(StructType(StructField("b", IntegerType, true) :: Nil)), - None, None)(ctx) + None, None)(sqlContext) val logicalRelation3 = LogicalRelation(relation3) assert(relation0 !== relation1) @@ -1078,18 +1077,18 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { withTempPath(dir => { val path = dir.getCanonicalFile.toURI.toString - ctx.sparkContext.parallelize(1 to 100) + sparkContext.parallelize(1 to 100) .map(i => s"""{"a": 1, "b": "str$i"}""").saveAsTextFile(path) val d1 = ResolvedDataSource( - ctx, + sqlContext, userSpecifiedSchema = None, partitionColumns = Array.empty[String], provider = classOf[DefaultSource].getCanonicalName, options = Map("path" -> path)) val d2 = ResolvedDataSource( - ctx, + sqlContext, userSpecifiedSchema = None, partitionColumns = Array.empty[String], provider = classOf[DefaultSource].getCanonicalName, @@ -1105,24 +1104,21 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { } test("SPARK-7565 MapType in JsonRDD") { - val oldColumnNameOfCorruptRecord = ctx.conf.columnNameOfCorruptRecord - ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed") - - val schemaWithSimpleMap = StructType( - StructField("map", MapType(StringType, IntegerType, true), false) :: Nil) - try { - val temp = Utils.createTempDir().getPath - - val df = ctx.read.schema(schemaWithSimpleMap).json(mapType1) - df.write.mode("overwrite").parquet(temp) - // order of MapType is not defined - assert(ctx.read.parquet(temp).count() == 5) - - val df2 = ctx.read.json(corruptRecords) - df2.write.mode("overwrite").parquet(temp) - checkAnswer(ctx.read.parquet(temp), df2.collect()) - } finally { - ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, oldColumnNameOfCorruptRecord) + withSQLConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD.key -> "_unparsed") { + withTempDir { dir => + val schemaWithSimpleMap = StructType( + StructField("map", MapType(StringType, IntegerType, true), false) :: Nil) + val df = sqlContext.read.schema(schemaWithSimpleMap).json(mapType1) + + val path = dir.getAbsolutePath + df.write.mode("overwrite").parquet(path) + // order of MapType is not defined + assert(sqlContext.read.parquet(path).count() == 5) + + val df2 = sqlContext.read.json(corruptRecords) + df2.write.mode("overwrite").parquet(path) + checkAnswer(sqlContext.read.parquet(path), df2.collect()) + } } } @@ -1142,19 +1138,19 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { val d1 = new File(root, "d1=1") // root/dt=1/col1=abc val p1_col1 = makePartition( - ctx.sparkContext.parallelize(2 to 5).map(i => s"""{"a": 1, "b": "str$i"}"""), + sparkContext.parallelize(2 to 5).map(i => s"""{"a": 1, "b": "str$i"}"""), d1, "col1", "abc") // root/dt=1/col1=abd val p2 = makePartition( - 
ctx.sparkContext.parallelize(6 to 10).map(i => s"""{"a": 1, "b": "str$i"}"""), + sparkContext.parallelize(6 to 10).map(i => s"""{"a": 1, "b": "str$i"}"""), d1, "col1", "abd") - ctx.read.json(root.getAbsolutePath).registerTempTable("test_myjson_with_part") + sqlContext.read.json(root.getAbsolutePath).registerTempTable("test_myjson_with_part") checkAnswer(sql( "SELECT count(a) FROM test_myjson_with_part where d1 = 1 and col1='abc'"), Row(4)) checkAnswer(sql( @@ -1163,4 +1159,105 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { "SELECT count(a) FROM test_myjson_with_part where d1 = 1"), Row(9)) }) } + + test("backward compatibility") { + // This test we make sure our JSON support can read JSON data generated by previous version + // of Spark generated through toJSON method and JSON data source. + // The data is generated by the following program. + // Here are a few notes: + // - Spark 1.5.0 cannot save timestamp data. So, we manually added timestamp field (col13) + // in the JSON object. + // - For Spark before 1.5.1, we do not generate UDTs. So, we manually added the UDT value to + // JSON objects generated by those Spark versions (col17). + // - If the type is NullType, we do not write data out. + + // Create the schema. + val struct = + StructType( + StructField("f1", FloatType, true) :: + StructField("f2", ArrayType(BooleanType), true) :: Nil) + + val dataTypes = + Seq( + StringType, BinaryType, NullType, BooleanType, + ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), + DateType, TimestampType, + ArrayType(IntegerType), MapType(StringType, LongType), struct, + new MyDenseVectorUDT()) + val fields = dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, nullable = true) + } + val schema = StructType(fields) + + val constantValues = + Seq( + "a string in binary".getBytes("UTF-8"), + null, + true, + 1.toByte, + 2.toShort, + 3, + Long.MaxValue, + 0.25.toFloat, + 0.75, + new java.math.BigDecimal(s"1234.23456"), + new java.math.BigDecimal(s"1.23456"), + java.sql.Date.valueOf("2015-01-01"), + java.sql.Timestamp.valueOf("2015-01-01 23:50:59.123"), + Seq(2, 3, 4), + Map("a string" -> 2000L), + Row(4.75.toFloat, Seq(false, true)), + new MyDenseVector(Array(0.25, 2.25, 4.25))) + val data = + Row.fromSeq(Seq("Spark " + sqlContext.sparkContext.version) ++ constantValues) :: Nil + + // Data generated by previous versions. 
+ // scalastyle:off + val existingJSONData = + """{"col0":"Spark 1.2.2","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.3.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.3.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.4.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.4.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.5.0","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.5.0","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"16436","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: Nil + // scalastyle:on + + // Generate data for the current version. + val df = sqlContext.createDataFrame(sqlContext.sparkContext.parallelize(data, 1), schema) + withTempPath { path => + df.write.format("json").mode("overwrite").save(path.getCanonicalPath) + + // df.toJSON will convert internal rows to external rows first and then generate + // JSON objects. While, df.write.format("json") will write internal rows directly. + val allJSON = + existingJSONData ++ + df.toJSON.collect() ++ + sparkContext.textFile(path.getCanonicalPath).collect() + + Utils.deleteRecursively(path) + sparkContext.parallelize(allJSON, 1).saveAsTextFile(path.getCanonicalPath) + + // Read data back with the schema specified. 
+ val col0Values = + Seq( + "Spark 1.2.2", + "Spark 1.3.1", + "Spark 1.3.1", + "Spark 1.4.1", + "Spark 1.4.1", + "Spark 1.5.0", + "Spark 1.5.0", + "Spark " + sqlContext.sparkContext.version, + "Spark " + sqlContext.sparkContext.version) + val expectedResult = col0Values.map { v => + Row.fromSeq(Seq(v) ++ constantValues) + } + checkAnswer( + sqlContext.read.format("json").schema(schema).load(path.getCanonicalPath), + expectedResult + ) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala index 2864181cf91d5..713d1da1cb515 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala @@ -21,10 +21,10 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext private[json] trait TestJsonData { - protected def _sqlContext: SQLContext + protected def sqlContext: SQLContext def primitiveFieldAndType: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"string":"this is a simple string.", "integer":10, "long":21474836470, @@ -35,7 +35,7 @@ private[json] trait TestJsonData { }""" :: Nil) def primitiveFieldValueTypeConflict: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"num_num_1":11, "num_num_2":null, "num_num_3": 1.1, "num_bool":true, "num_str":13.1, "str_bool":"str1"}""" :: """{"num_num_1":null, "num_num_2":21474836470.9, "num_num_3": null, @@ -46,14 +46,14 @@ private[json] trait TestJsonData { "num_bool":null, "num_str":92233720368547758070, "str_bool":null}""" :: Nil) def jsonNullStruct: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"nullstr":"","ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" :: """{"nullstr":"","ip":"27.31.100.29","headers":{}}""" :: """{"nullstr":"","ip":"27.31.100.29","headers":""}""" :: """{"nullstr":null,"ip":"27.31.100.29","headers":null}""" :: Nil) def complexFieldValueTypeConflict: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"num_struct":11, "str_array":[1, 2, 3], "array":[], "struct_array":[], "struct": {}}""" :: """{"num_struct":{"field":false}, "str_array":null, @@ -64,14 +64,14 @@ private[json] trait TestJsonData { "array":[7], "struct_array":{"field": true}, "struct": {"field": "str"}}""" :: Nil) def arrayElementTypeConflict: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"array1": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}], "array2": [{"field":214748364700}, {"field":1}]}""" :: """{"array3": [{"field":"str"}, {"field":1}]}""" :: """{"array3": [1, 2, 3]}""" :: Nil) def missingFields: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"a":true}""" :: """{"b":21474836470}""" :: """{"c":[33, 44]}""" :: @@ -79,7 +79,7 @@ private[json] trait TestJsonData { """{"e":"str"}""" :: Nil) def complexFieldAndType1: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"struct":{"field1": true, "field2": 92233720368547758070}, "structWithArrayFields":{"field1":[4, 5, 6], "field2":["str1", "str2"]}, "arrayOfString":["str1", "str2"], @@ -95,7 +95,7 @@ private[json] trait TestJsonData { }""" :: Nil) def 
complexFieldAndType2: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"arrayOfStruct":[{"field1": true, "field2": "str1"}, {"field1": false}, {"field3": null}], "complexArrayOfStruct": [ { @@ -149,7 +149,7 @@ private[json] trait TestJsonData { }""" :: Nil) def mapType1: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"map": {"a": 1}}""" :: """{"map": {"b": 2}}""" :: """{"map": {"c": 3}}""" :: @@ -157,7 +157,7 @@ private[json] trait TestJsonData { """{"map": {"e": null}}""" :: Nil) def mapType2: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"map": {"a": {"field1": [1, 2, 3, null]}}}""" :: """{"map": {"b": {"field2": 2}}}""" :: """{"map": {"c": {"field1": [], "field2": 4}}}""" :: @@ -166,21 +166,21 @@ private[json] trait TestJsonData { """{"map": {"f": {"field1": null}}}""" :: Nil) def nullsInArrays: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{"field1":[[null], [[["Test"]]]]}""" :: """{"field2":[null, [{"Test":1}]]}""" :: """{"field3":[[null], [{"Test":"2"}]]}""" :: """{"field4":[[null, [1,2,3]]]}""" :: Nil) def jsonArray: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """[{"a":"str_a_1"}]""" :: """[{"a":"str_a_2"}, {"b":"str_b_3"}]""" :: """{"b":"str_b_4", "a":"str_a_4", "c":"str_c_4"}""" :: """[]""" :: Nil) def corruptRecords: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{""" :: """""" :: """{"a":1, b:2}""" :: @@ -189,7 +189,7 @@ private[json] trait TestJsonData { """]""" :: Nil) def emptyRecords: RDD[String] = - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( """{""" :: """""" :: """{"a": {}}""" :: @@ -198,7 +198,7 @@ private[json] trait TestJsonData { """]""" :: Nil) - lazy val singleRow: RDD[String] = _sqlContext.sparkContext.parallelize("""{"a":123}""" :: Nil) + lazy val singleRow: RDD[String] = sqlContext.sparkContext.parallelize("""{"a":123}""" :: Nil) - def empty: RDD[String] = _sqlContext.sparkContext.parallelize(Seq[String]()) + def empty: RDD[String] = sqlContext.sparkContext.parallelize(Seq[String]()) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala index bd7cf8c10abef..36b929ee1f409 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetAvroCompatibilitySuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.datasources.parquet +import java.io.File import java.nio.ByteBuffer import java.util.{List => JList, Map => JMap} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala index df68432faeeb3..0835bd123049b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompatibilityTest.scala @@ -17,11 +17,15 @@ package org.apache.spark.sql.execution.datasources.parquet -import 
scala.collection.JavaConverters._ +import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter} +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, PathFilter} -import org.apache.parquet.hadoop.ParquetFileReader -import org.apache.parquet.schema.MessageType +import org.apache.parquet.hadoop.api.WriteSupport +import org.apache.parquet.hadoop.api.WriteSupport.WriteContext +import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter} +import org.apache.parquet.io.api.RecordConsumer +import org.apache.parquet.schema.{MessageType, MessageTypeParser} import org.apache.spark.sql.QueryTest @@ -35,14 +39,14 @@ private[sql] abstract class ParquetCompatibilityTest extends QueryTest with Parq protected def readParquetSchema(path: String, pathFilter: Path => Boolean): MessageType = { val fsPath = new Path(path) - val fs = fsPath.getFileSystem(configuration) + val fs = fsPath.getFileSystem(hadoopConfiguration) val parquetFiles = fs.listStatus(fsPath, new PathFilter { override def accept(path: Path): Boolean = pathFilter(path) - }).toSeq + }).toSeq.asJava val footers = - ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles.asJava, true) - footers.iterator().next().getParquetMetadata.getFileMetaData.getSchema + ParquetFileReader.readAllFootersInParallel(hadoopConfiguration, parquetFiles, true) + footers.asScala.head.getParquetMetadata.getFileMetaData.getSchema } protected def logParquetSchema(path: String): Unit = { @@ -53,8 +57,69 @@ private[sql] abstract class ParquetCompatibilityTest extends QueryTest with Parq } } -object ParquetCompatibilityTest { - def makeNullable[T <: AnyRef](i: Int)(f: => T): T = { - if (i % 3 == 0) null.asInstanceOf[T] else f +private[sql] object ParquetCompatibilityTest { + implicit class RecordConsumerDSL(consumer: RecordConsumer) { + def message(f: => Unit): Unit = { + consumer.startMessage() + f + consumer.endMessage() + } + + def group(f: => Unit): Unit = { + consumer.startGroup() + f + consumer.endGroup() + } + + def field(name: String, index: Int)(f: => Unit): Unit = { + consumer.startField(name, index) + f + consumer.endField(name, index) + } + } + + /** + * A testing Parquet [[WriteSupport]] implementation used to write manually constructed Parquet + * records with arbitrary structures. + */ + private class DirectWriteSupport(schema: MessageType, metadata: Map[String, String]) + extends WriteSupport[RecordConsumer => Unit] { + + private var recordConsumer: RecordConsumer = _ + + override def init(configuration: Configuration): WriteContext = { + new WriteContext(schema, metadata.asJava) + } + + override def write(recordWriter: RecordConsumer => Unit): Unit = { + recordWriter.apply(recordConsumer) + } + + override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { + this.recordConsumer = recordConsumer + } + } + + /** + * Writes arbitrary messages conforming to a given `schema` to a Parquet file located by `path`. + * Records are produced by `recordWriters`. + */ + def writeDirect(path: String, schema: String, recordWriters: (RecordConsumer => Unit)*): Unit = { + writeDirect(path, schema, Map.empty[String, String], recordWriters: _*) + } + + /** + * Writes arbitrary messages conforming to a given `schema` to a Parquet file located by `path` + * with given user-defined key-value `metadata`. Records are produced by `recordWriters`. 
+ */ + def writeDirect( + path: String, + schema: String, + metadata: Map[String, String], + recordWriters: (RecordConsumer => Unit)*): Unit = { + val messageType = MessageTypeParser.parseMessageType(schema) + val writeSupport = new DirectWriteSupport(messageType, metadata) + val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport) + try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index f067112cfca95..45ad3fde559c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -55,7 +55,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex .where(Column(predicate)) val analyzedPredicate = query.queryExecution.optimizedPlan.collect { - case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation)) => filters + case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation, _)) => filters }.flatten assert(analyzedPredicate.nonEmpty) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 08d2b9dee99b0..cd552e83372f1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -101,7 +101,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { test("fixed-length decimals") { def makeDecimalRDD(decimal: DecimalType): DataFrame = - sqlContext.sparkContext + sparkContext .parallelize(0 to 1000) .map(i => Tuple1(i / 100.0)) .toDF() @@ -119,7 +119,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { test("date type") { def makeDateRDD(): DataFrame = - sqlContext.sparkContext + sparkContext .parallelize(0 to 1000) .map(i => Tuple1(DateTimeUtils.toJavaDate(i))) .toDF() @@ -207,7 +207,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { test("compression codec") { def compressionCodecFor(path: String): String = { val codecs = ParquetTypesConverter - .readMetaData(new Path(path), Some(configuration)).getBlocks.asScala + .readMetaData(new Path(path), Some(hadoopConfiguration)).getBlocks.asScala .flatMap(_.getColumns.asScala) .map(_.getCodec.name()) .distinct @@ -277,14 +277,14 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { test("write metadata") { withTempPath { file => val path = new Path(file.toURI.toString) - val fs = FileSystem.getLocal(configuration) + val fs = FileSystem.getLocal(hadoopConfiguration) val attributes = ScalaReflection.attributesFor[(Int, String)] - ParquetTypesConverter.writeMetaData(attributes, path, configuration) + ParquetTypesConverter.writeMetaData(attributes, path, hadoopConfiguration) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val metaData = ParquetTypesConverter.readMetaData(path, Some(configuration)) + val metaData = 
ParquetTypesConverter.readMetaData(path, Some(hadoopConfiguration)) val actualSchema = metaData.getFileMetaData.getSchema val expectedSchema = ParquetTypesConverter.convertFromAttributes(attributes) @@ -355,7 +355,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { val path = new Path(location.getCanonicalPath) ParquetFileWriter.writeMetadataFile( - sqlContext.sparkContext.hadoopConfiguration, + sparkContext.hadoopConfiguration, path, Collections.singletonList( new Footer(path, new ParquetMetadata(fileMetadata, Collections.emptyList())))) @@ -370,12 +370,12 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } test("SPARK-6352 DirectParquetOutputCommitter") { - val clonedConf = new Configuration(configuration) + val clonedConf = new Configuration(hadoopConfiguration) // Write to a parquet file and let it fail. // _temporary should be missing if direct output committer works. try { - configuration.set("spark.sql.parquet.output.committer.class", + hadoopConfiguration.set("spark.sql.parquet.output.committer.class", classOf[DirectParquetOutputCommitter].getCanonicalName) sqlContext.udf.register("div0", (x: Int) => x / 0) withTempPath { dir => @@ -383,23 +383,23 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { sqlContext.sql("select div0(1)").write.parquet(dir.getCanonicalPath) } val path = new Path(dir.getCanonicalPath, "_temporary") - val fs = path.getFileSystem(configuration) + val fs = path.getFileSystem(hadoopConfiguration) assert(!fs.exists(path)) } } finally { // Hadoop 1 doesn't have `Configuration.unset` - configuration.clear() - clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + hadoopConfiguration.clear() + clonedConf.asScala.foreach(entry => hadoopConfiguration.set(entry.getKey, entry.getValue)) } } test("SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible") { - val clonedConf = new Configuration(configuration) + val clonedConf = new Configuration(hadoopConfiguration) // Write to a parquet file and let it fail. // _temporary should be missing if direct output committer works. 
try { - configuration.set("spark.sql.parquet.output.committer.class", + hadoopConfiguration.set("spark.sql.parquet.output.committer.class", "org.apache.spark.sql.parquet.DirectParquetOutputCommitter") sqlContext.udf.register("div0", (x: Int) => x / 0) withTempPath { dir => @@ -407,25 +407,25 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { sqlContext.sql("select div0(1)").write.parquet(dir.getCanonicalPath) } val path = new Path(dir.getCanonicalPath, "_temporary") - val fs = path.getFileSystem(configuration) + val fs = path.getFileSystem(hadoopConfiguration) assert(!fs.exists(path)) } } finally { // Hadoop 1 doesn't have `Configuration.unset` - configuration.clear() - clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + hadoopConfiguration.clear() + clonedConf.asScala.foreach(entry => hadoopConfiguration.set(entry.getKey, entry.getValue)) } } test("SPARK-8121: spark.sql.parquet.output.committer.class shouldn't be overridden") { withTempPath { dir => - val clonedConf = new Configuration(configuration) + val clonedConf = new Configuration(hadoopConfiguration) - configuration.set( + hadoopConfiguration.set( SQLConf.OUTPUT_COMMITTER_CLASS.key, classOf[ParquetOutputCommitter].getCanonicalName) - configuration.set( + hadoopConfiguration.set( "spark.sql.parquet.output.committer.class", classOf[JobCommitFailureParquetOutputCommitter].getCanonicalName) @@ -436,8 +436,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { assert(message === "Intentional exception for testing purposes") } finally { // Hadoop 1 doesn't have `Configuration.unset` - configuration.clear() - clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + hadoopConfiguration.clear() + clonedConf.asScala.foreach(entry => hadoopConfiguration.set(entry.getKey, entry.getValue)) } } } @@ -455,11 +455,11 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } test("SPARK-7837 Do not close output writer twice when commitTask() fails") { - val clonedConf = new Configuration(configuration) + val clonedConf = new Configuration(hadoopConfiguration) // Using a output committer that always fail when committing a task, so that both // `commitTask()` and `abortTask()` are invoked. - configuration.set( + hadoopConfiguration.set( "spark.sql.parquet.output.committer.class", classOf[TaskCommitFailureParquetOutputCommitter].getCanonicalName) @@ -483,8 +483,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } } finally { // Hadoop 1 doesn't have `Configuration.unset` - configuration.clear() - clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + hadoopConfiguration.clear() + clonedConf.asScala.foreach(entry => hadoopConfiguration.set(entry.getKey, entry.getValue)) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala new file mode 100644 index 0000000000000..83b65fb419ed3 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.File + +import org.apache.spark.sql.Row +import org.apache.spark.sql.test.SharedSQLContext + +class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedSQLContext { + test("parquet files with different physical schemas but share the same logical schema") { + import ParquetCompatibilityTest._ + + // This test case writes two Parquet files, both representing the following Catalyst schema + // + // StructType( + // StructField( + // "f", + // ArrayType(IntegerType, containsNull = false), + // nullable = false)) + // + // The first Parquet file comes with parquet-avro style 2-level LIST-annotated group, while the + // other one comes with parquet-protobuf style 1-level unannotated primitive field. + withTempDir { dir => + val avroStylePath = new File(dir, "avro-style").getCanonicalPath + val protobufStylePath = new File(dir, "protobuf-style").getCanonicalPath + + val avroStyleSchema = + """message avro_style { + | required group f (LIST) { + | repeated int32 array; + | } + |} + """.stripMargin + + writeDirect(avroStylePath, avroStyleSchema, { rc => + rc.message { + rc.field("f", 0) { + rc.group { + rc.field("array", 0) { + rc.addInteger(0) + rc.addInteger(1) + } + } + } + } + }) + + logParquetSchema(avroStylePath) + + val protobufStyleSchema = + """message protobuf_style { + | repeated int32 f; + |} + """.stripMargin + + writeDirect(protobufStylePath, protobufStyleSchema, { rc => + rc.message { + rc.field("f", 0) { + rc.addInteger(2) + rc.addInteger(3) + } + } + }) + + logParquetSchema(protobufStylePath) + + checkAnswer( + sqlContext.read.parquet(dir.getCanonicalPath), + Seq( + Row(Seq(0, 1)), + Row(Seq(2, 3)))) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index ed8bafb10c60b..3a23b8ed66808 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -465,7 +465,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath) val queryExecution = sqlContext.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { - case LogicalRelation(relation: ParquetRelation) => + case LogicalRelation(relation: ParquetRelation, _) => assert(relation.partitionSpec === PartitionSpec.emptySpec) }.getOrElse { fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution") @@ -517,7 +517,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha } val schema = 
StructType(partitionColumns :+ StructField(s"i", StringType)) - val df = sqlContext.createDataFrame(sqlContext.sparkContext.parallelize(row :: Nil), schema) + val df = sqlContext.createDataFrame(sparkContext.parallelize(row :: Nil), schema) withTempPath { dir => df.write.format("parquet").partitionBy(partitionColumns.map(_.name): _*).save(dir.toString) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index b7b70c2bbbd5c..1c1cfa34ad04b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -22,6 +22,9 @@ import java.io.File import org.apache.hadoop.fs.Path import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.execution.datasources.parquet.TestingUDT.{NestedStruct, NestedStructUDT} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -30,6 +33,7 @@ import org.apache.spark.util.Utils * A test suite that tests various Parquet queries. */ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext { + import testImplicits._ test("simple select queries") { withParquetTable((0 until 10).map(i => (i, i.toString)), "t") { @@ -40,22 +44,22 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext test("appending") { val data = (0 until 10).map(i => (i, i.toString)) - ctx.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") + sqlContext.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") withParquetTable(data, "t") { sql("INSERT INTO TABLE t SELECT * FROM tmp") - checkAnswer(ctx.table("t"), (data ++ data).map(Row.fromTuple)) + checkAnswer(sqlContext.table("t"), (data ++ data).map(Row.fromTuple)) } - ctx.catalog.unregisterTable(Seq("tmp")) + sqlContext.catalog.unregisterTable(Seq("tmp")) } test("overwriting") { val data = (0 until 10).map(i => (i, i.toString)) - ctx.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") + sqlContext.createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") withParquetTable(data, "t") { sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp") - checkAnswer(ctx.table("t"), data.map(Row.fromTuple)) + checkAnswer(sqlContext.table("t"), data.map(Row.fromTuple)) } - ctx.catalog.unregisterTable(Seq("tmp")) + sqlContext.catalog.unregisterTable(Seq("tmp")) } test("self-join") { @@ -118,9 +122,9 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext val schema = StructType(List(StructField("d", DecimalType(18, 0), false), StructField("time", TimestampType, false)).toArray) withTempPath { file => - val df = ctx.createDataFrame(ctx.sparkContext.parallelize(data), schema) + val df = sqlContext.createDataFrame(sparkContext.parallelize(data), schema) df.write.parquet(file.getCanonicalPath) - val df2 = ctx.read.parquet(file.getCanonicalPath) + val df2 = sqlContext.read.parquet(file.getCanonicalPath) checkAnswer(df2, df.collect().toSeq) } } @@ -129,12 +133,12 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext def testSchemaMerging(expectedColumnNumber: Int): Unit = { withTempDir { dir => val basePath = dir.getCanonicalPath - 
ctx.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) - ctx.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString) + sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) + sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString) // delete summary files, so if we don't merge part-files, one column will not be included. Utils.deleteRecursively(new File(basePath + "/foo=1/_metadata")) Utils.deleteRecursively(new File(basePath + "/foo=1/_common_metadata")) - assert(ctx.read.parquet(basePath).columns.length === expectedColumnNumber) + assert(sqlContext.read.parquet(basePath).columns.length === expectedColumnNumber) } } @@ -153,9 +157,9 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext def testSchemaMerging(expectedColumnNumber: Int): Unit = { withTempDir { dir => val basePath = dir.getCanonicalPath - ctx.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) - ctx.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString) - assert(ctx.read.parquet(basePath).columns.length === expectedColumnNumber) + sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) + sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString) + assert(sqlContext.read.parquet(basePath).columns.length === expectedColumnNumber) } } @@ -171,19 +175,19 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext test("SPARK-8990 DataFrameReader.parquet() should respect user specified options") { withTempPath { dir => val basePath = dir.getCanonicalPath - ctx.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) - ctx.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=a").toString) + sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) + sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=a").toString) // Disables the global SQL option for schema merging withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "false") { assertResult(2) { // Disables schema merging via data source option - ctx.read.option("mergeSchema", "false").parquet(basePath).columns.length + sqlContext.read.option("mergeSchema", "false").parquet(basePath).columns.length } assertResult(3) { // Enables schema merging via data source option - ctx.read.option("mergeSchema", "true").parquet(basePath).columns.length + sqlContext.read.option("mergeSchema", "true").parquet(basePath).columns.length } } } @@ -193,7 +197,7 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext withTempPath { dir => val basePath = dir.getCanonicalPath val schema = StructType(Array(StructField("name", DecimalType(10, 5), false))) - val rowRDD = sqlContext.sparkContext.parallelize(Array(Row(Decimal("67123.45")))) + val rowRDD = sparkContext.parallelize(Array(Row(Decimal("67123.45")))) val df = sqlContext.createDataFrame(rowRDD, schema) df.write.parquet(basePath) @@ -203,9 +207,6 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } test("SPARK-10005 Schema merging for nested struct") { - val sqlContext = _sqlContext - import sqlContext.implicits._ - withTempPath { dir => val path = dir.getCanonicalPath @@ -229,4 +230,323 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } } + + test("SPARK-10301 requested schema clipping - same schema") { + withTempPath { 
dir => + val path = dir.getCanonicalPath + val df = sqlContext.range(1).selectExpr("NAMED_STRUCT('a', id, 'b', id + 1) AS s").coalesce(1) + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, 1L))) + } + } + + // This test case is ignored because of parquet-mr bug PARQUET-370 + ignore("SPARK-10301 requested schema clipping - schemas with disjoint sets of fields") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext.range(1).selectExpr("NAMED_STRUCT('a', id, 'b', id + 1) AS s").coalesce(1) + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("c", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(null, null))) + } + } + + test("SPARK-10301 requested schema clipping - requested schema contains physical schema") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext.range(1).selectExpr("NAMED_STRUCT('a', id, 'b', id + 1) AS s").coalesce(1) + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true) + .add("c", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, 1L, null, null))) + } + + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext.range(1).selectExpr("NAMED_STRUCT('a', id, 'd', id + 3) AS s").coalesce(1) + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true) + .add("c", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, null, null, 3L))) + } + } + + test("SPARK-10301 requested schema clipping - physical schema contains requested schema") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2, 'd', id + 3) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, 1L))) + } + + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2, 'd', id + 3) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("a", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(0L, 3L))) + } + } + + test("SPARK-10301 requested schema clipping - schemas overlap but don't contain each other") { + withTempPath { dir => + val path = dir.getCanonicalPath + val df = sqlContext + 
.range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("b", LongType, nullable = true) + .add("c", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(1L, 2L, null))) + } + } + + test("SPARK-10301 requested schema clipping - deeply nested struct") { + withTempPath { dir => + val path = dir.getCanonicalPath + + val df = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', ARRAY(NAMED_STRUCT('b', id, 'c', id))) AS s") + .coalesce(1) + + df.write.parquet(path) + + val userDefinedSchema = new StructType() + .add("s", + new StructType() + .add( + "a", + ArrayType( + new StructType() + .add("b", LongType, nullable = true) + .add("d", StringType, nullable = true), + containsNull = true), + nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(Seq(Row(0, null))))) + } + } + + test("SPARK-10301 requested schema clipping - out of order") { + withTempPath { dir => + val path = dir.getCanonicalPath + + val df1 = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2) AS s") + .coalesce(1) + + val df2 = sqlContext + .range(1, 2) + .selectExpr("NAMED_STRUCT('c', id + 2, 'b', id + 1, 'd', id + 3) AS s") + .coalesce(1) + + df1.write.parquet(path) + df2.write.mode(SaveMode.Append).parquet(path) + + val userDefinedSchema = new StructType() + .add("s", + new StructType() + .add("a", LongType, nullable = true) + .add("b", LongType, nullable = true) + .add("d", LongType, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Seq( + Row(Row(0, 1, null)), + Row(Row(null, 2, 4)))) + } + } + + test("SPARK-10301 requested schema clipping - schema merging") { + withTempPath { dir => + val path = dir.getCanonicalPath + + val df1 = sqlContext + .range(1) + .selectExpr("NAMED_STRUCT('a', id, 'c', id + 2) AS s") + .coalesce(1) + + val df2 = sqlContext + .range(1, 2) + .selectExpr("NAMED_STRUCT('a', id, 'b', id + 1, 'c', id + 2) AS s") + .coalesce(1) + + df1.write.mode(SaveMode.Append).parquet(path) + df2.write.mode(SaveMode.Append).parquet(path) + + checkAnswer( + sqlContext + .read + .option("mergeSchema", "true") + .parquet(path) + .selectExpr("s.a", "s.b", "s.c"), + Seq( + Row(0, null, 2), + Row(1, 2, 3))) + } + } + + test("SPARK-10301 requested schema clipping - UDT") { + withTempPath { dir => + val path = dir.getCanonicalPath + + val df = sqlContext + .range(1) + .selectExpr( + """NAMED_STRUCT( + | 'f0', CAST(id AS STRING), + | 'f1', NAMED_STRUCT( + | 'a', CAST(id + 1 AS INT), + | 'b', CAST(id + 2 AS LONG), + | 'c', CAST(id + 3.5 AS DOUBLE) + | ) + |) AS s + """.stripMargin) + .coalesce(1) + + df.write.mode(SaveMode.Append).parquet(path) + + val userDefinedSchema = + new StructType() + .add( + "s", + new StructType() + .add("f1", new NestedStructUDT, nullable = true), + nullable = true) + + checkAnswer( + sqlContext.read.schema(userDefinedSchema).parquet(path), + Row(Row(NestedStruct(1, 2L, 3.5D)))) + } + } +} + +object TestingUDT { + @SQLUserDefinedType(udt = classOf[NestedStructUDT]) + case class NestedStruct(a: Integer, b: Long, c: Double) + + class NestedStructUDT extends UserDefinedType[NestedStruct] { + override def sqlType: DataType = + new StructType() + .add("a", IntegerType, 
nullable = true) + .add("b", LongType, nullable = false) + .add("c", DoubleType, nullable = false) + + override def serialize(obj: Any): Any = { + val row = new SpecificMutableRow(sqlType.asInstanceOf[StructType].map(_.dataType)) + obj match { + case n: NestedStruct => + row.setInt(0, n.a) + row.setLong(1, n.b) + row.setDouble(2, n.c) + } + } + + override def userClass: Class[NestedStruct] = classOf[NestedStruct] + + override def deserialize(datum: Any): NestedStruct = { + datum match { + case row: InternalRow => + NestedStruct(row.getInt(0), row.getLong(1), row.getDouble(2)) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 9dcbc1a047bea..5a8f772c32289 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -22,6 +22,7 @@ import scala.reflect.runtime.universe.TypeTag import org.apache.parquet.schema.MessageTypeParser +import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -196,7 +197,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { | repeated group bag { - | optional int32 array_element; + | optional int32 array; | } | } |} @@ -265,7 +266,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _1 (UTF8); | optional group _2 (LIST) { | repeated group bag { - | optional group array_element { + | optional group array { | required int32 _1; | required double _2; | } @@ -644,7 +645,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional group f1 (LIST) { | repeated group bag { - | optional int32 array_element; + | optional int32 array; | } | } |} @@ -941,4 +942,531 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional fixed_len_byte_array(8) f1 (DECIMAL(18, 3)); |} """.stripMargin) + + private def testSchemaClipping( + testName: String, + parquetSchema: String, + catalystSchema: StructType, + expectedSchema: String): Unit = { + test(s"Clipping - $testName") { + val expected = MessageTypeParser.parseMessageType(expectedSchema) + val actual = CatalystReadSupport.clipParquetSchema( + MessageTypeParser.parseMessageType(parquetSchema), catalystSchema) + + try { + expected.checkContains(actual) + actual.checkContains(expected) + } catch { case cause: Throwable => + fail( + s"""Expected clipped schema: + |$expected + |Actual clipped schema: + |$actual + """.stripMargin, + cause) + } + } + } + + testSchemaClipping( + "simple nested struct", + + parquetSchema = + """message root { + | required group f0 { + | optional int32 f00; + | optional int32 f01; + | } + |} + """.stripMargin, + + catalystSchema = { + val f0Type = new StructType().add("f00", IntegerType, nullable = true) + new StructType() + .add("f0", f0Type, nullable = false) + .add("f1", IntegerType, nullable = true) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional int32 f00; + | } + | optional int32 f1; + |} + """.stripMargin) + + testSchemaClipping( + "parquet-protobuf style array", + + parquetSchema = + """message root { + | required group f0 { + | repeated binary f00 (UTF8); + | repeated group f01 { + | 
optional int32 f010; + | optional double f011; + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f00Type = ArrayType(StringType, containsNull = false) + val f01Type = ArrayType( + new StructType() + .add("f011", DoubleType, nullable = true), + containsNull = false) + + val f0Type = new StructType() + .add("f00", f00Type, nullable = false) + .add("f01", f01Type, nullable = false) + val f1Type = ArrayType(IntegerType, containsNull = true) + + new StructType() + .add("f0", f0Type, nullable = false) + .add("f1", f1Type, nullable = true) + }, + + expectedSchema = + """message root { + | required group f0 { + | repeated binary f00 (UTF8); + | repeated group f01 { + | optional double f011; + | } + | } + | + | optional group f1 (LIST) { + | repeated group list { + | optional int32 element; + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-thrift style array", + + parquetSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated binary f00_tuple (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group f01_tuple { + | optional int32 f010; + | optional double f011; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = false), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = false), nullable = true) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated binary f00_tuple (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group f01_tuple { + | optional double f011; + | optional int64 f012; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-avro style array", + + parquetSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated binary array (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group array { + | optional int32 f010; + | optional double f011; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = false), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = false), nullable = true) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated binary array (UTF8); + | } + | + | optional group f01 (LIST) { + | repeated group array { + | optional double f011; + | optional int64 f012; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-hive style array", + + parquetSchema = + """message root { + | optional group f0 { + | optional group f00 (LIST) { + | repeated group bag { + | optional binary array_element; + | } + | } + | + | optional group f01 (LIST) { + | repeated group bag { + | optional group array_element { + | optional int32 f010; + | optional double f011; + | } + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + 
.add("f00", ArrayType(StringType, containsNull = true), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = true), nullable = true) + + new StructType().add("f0", f0Type, nullable = true) + }, + + expectedSchema = + """message root { + | optional group f0 { + | optional group f00 (LIST) { + | repeated group bag { + | optional binary array_element; + | } + | } + | + | optional group f01 (LIST) { + | repeated group bag { + | optional group array_element { + | optional double f011; + | optional int64 f012; + | } + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "2-level list of required struct", + + parquetSchema = + s"""message root { + | required group f0 { + | required group f00 (LIST) { + | repeated group element { + | required int32 f000; + | optional int64 f001; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f00ElementType = + new StructType() + .add("f001", LongType, nullable = true) + .add("f002", DoubleType, nullable = false) + + val f00Type = ArrayType(f00ElementType, containsNull = false) + val f0Type = new StructType().add("f00", f00Type, nullable = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + s"""message root { + | required group f0 { + | required group f00 (LIST) { + | repeated group element { + | optional int64 f001; + | required double f002; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "standard array", + + parquetSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | + | optional group f01 (LIST) { + | repeated group list { + | required group element { + | optional int32 f010; + | optional double f011; + | } + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val f01ElementType = new StructType() + .add("f011", DoubleType, nullable = true) + .add("f012", LongType, nullable = true) + + val f0Type = new StructType() + .add("f00", ArrayType(StringType, containsNull = false), nullable = true) + .add("f01", ArrayType(f01ElementType, containsNull = false), nullable = true) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 { + | optional group f00 (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | + | optional group f01 (LIST) { + | repeated group list { + | required group element { + | optional double f011; + | optional int64 f012; + | } + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "empty requested schema", + + parquetSchema = + """message root { + | required group f0 { + | required int32 f00; + | required int64 f01; + | } + |} + """.stripMargin, + + catalystSchema = new StructType(), + + expectedSchema = "message root {}") + + testSchemaClipping( + "disjoint field sets", + + parquetSchema = + """message root { + | required group f0 { + | required int32 f00; + | required int64 f01; + | } + |} + """.stripMargin, + + catalystSchema = + new StructType() + .add( + "f0", + new StructType() + .add("f02", FloatType, nullable = true) + .add("f03", DoubleType, nullable = true), + nullable = true), + + expectedSchema = + """message root { + | required group f0 { + | optional float f02; + | optional double f03; + | } + |} + """.stripMargin) + + testSchemaClipping( + "parquet-avro style map", + + parquetSchema = + """message root { + | required group f0 (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | 
required int32 key; + | required group value { + | required int32 value_f0; + | required int64 value_f1; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val valueType = + new StructType() + .add("value_f1", LongType, nullable = false) + .add("value_f2", DoubleType, nullable = false) + + val f0Type = MapType(IntegerType, valueType, valueContainsNull = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required int32 key; + | required group value { + | required int64 value_f1; + | required double value_f2; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "standard map", + + parquetSchema = + """message root { + | required group f0 (MAP) { + | repeated group key_value { + | required int32 key; + | required group value { + | required int32 value_f0; + | required int64 value_f1; + | } + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val valueType = + new StructType() + .add("value_f1", LongType, nullable = false) + .add("value_f2", DoubleType, nullable = false) + + val f0Type = MapType(IntegerType, valueType, valueContainsNull = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 (MAP) { + | repeated group key_value { + | required int32 key; + | required group value { + | required int64 value_f1; + | required double value_f2; + | } + | } + | } + |} + """.stripMargin) + + testSchemaClipping( + "standard map with complex key", + + parquetSchema = + """message root { + | required group f0 (MAP) { + | repeated group key_value { + | required group key { + | required int32 value_f0; + | required int64 value_f1; + | } + | required int32 value; + | } + | } + |} + """.stripMargin, + + catalystSchema = { + val keyType = + new StructType() + .add("value_f1", LongType, nullable = false) + .add("value_f2", DoubleType, nullable = false) + + val f0Type = MapType(keyType, IntegerType, valueContainsNull = false) + + new StructType().add("f0", f0Type, nullable = false) + }, + + expectedSchema = + """message root { + | required group f0 (MAP) { + | repeated group key_value { + | required group key { + | required int64 value_f1; + | required double value_f2; + | } + | required int32 value; + | } + | } + |} + """.stripMargin) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala index 5dbc7d1630f27..442fafb12f200 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} * Especially, `Tuple1.apply` can be used to easily wrap a single type/value. 
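The clipping tests above all exercise the same call pattern: parse a physical Parquet schema, clip it against the requested Catalyst schema with CatalystReadSupport.clipParquetSchema, and inspect the result via MessageTypeParser. A minimal, self-contained sketch of that pattern (the object and value names are illustrative, not part of the patch):

package org.apache.spark.sql.execution.datasources.parquet

import org.apache.parquet.schema.MessageTypeParser
import org.apache.spark.sql.types._

object SchemaClippingSketch {
  def main(args: Array[String]): Unit = {
    // Physical Parquet schema on disk: a struct with two fields.
    val parquetSchema = MessageTypeParser.parseMessageType(
      """message root {
        |  required group f0 {
        |    optional int32 f00;
        |    optional int32 f01;
        |  }
        |}
      """.stripMargin)

    // Requested Catalyst schema: only f0.f00 is needed.
    val catalystSchema = new StructType()
      .add("f0", new StructType().add("f00", IntegerType, nullable = true), nullable = false)

    // clipParquetSchema drops the Parquet fields the query does not ask for
    // (f0.f01 here), so the reader only touches the requested columns.
    val clipped = CatalystReadSupport.clipParquetSchema(parquetSchema, catalystSchema)
    println(clipped)
  }
}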
*/ private[sql] trait ParquetTest extends SQLTestUtils { - protected def _sqlContext: SQLContext /** * Writes `data` to a Parquet file, which is then passed to `f` and will be deleted after `f` @@ -43,7 +42,7 @@ private[sql] trait ParquetTest extends SQLTestUtils { (data: Seq[T]) (f: String => Unit): Unit = { withTempPath { file => - _sqlContext.createDataFrame(data).write.parquet(file.getCanonicalPath) + sqlContext.createDataFrame(data).write.parquet(file.getCanonicalPath) f(file.getCanonicalPath) } } @@ -55,7 +54,7 @@ private[sql] trait ParquetTest extends SQLTestUtils { protected def withParquetDataFrame[T <: Product: ClassTag: TypeTag] (data: Seq[T]) (f: DataFrame => Unit): Unit = { - withParquetFile(data)(path => f(_sqlContext.read.parquet(path))) + withParquetFile(data)(path => f(sqlContext.read.parquet(path))) } /** @@ -67,14 +66,14 @@ private[sql] trait ParquetTest extends SQLTestUtils { (data: Seq[T], tableName: String) (f: => Unit): Unit = { withParquetDataFrame(data) { df => - _sqlContext.registerDataFrameAsTable(df, tableName) + sqlContext.registerDataFrameAsTable(df, tableName) withTempTable(tableName)(f) } } protected def makeParquetFile[T <: Product: ClassTag: TypeTag]( data: Seq[T], path: File): Unit = { - _sqlContext.createDataFrame(data).write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath) + sqlContext.createDataFrame(data).write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath) } protected def makeParquetFile[T <: Product: ClassTag: TypeTag]( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala index b789c5a106e56..88a3d878f97fe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala @@ -33,11 +33,9 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest with Shar """.stripMargin) checkAnswer(sqlContext.read.parquet(parquetFilePath.toString), (0 until 10).map { i => - def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i) - val suits = Array("SPADES", "HEARTS", "DIAMONDS", "CLUBS") - Row( + val nonNullablePrimitiveValues = Seq( i % 2 == 0, i.toByte, (i + 1).toShort, @@ -50,18 +48,15 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest with Shar s"val_$i", s"val_$i", // Thrift ENUM values are converted to Parquet binaries containing UTF-8 strings - suits(i % 4), - - nullable(i % 2 == 0: java.lang.Boolean), - nullable(i.toByte: java.lang.Byte), - nullable((i + 1).toShort: java.lang.Short), - nullable(i + 2: Integer), - nullable((i * 10).toLong: java.lang.Long), - nullable(i.toDouble + 0.2d: java.lang.Double), - nullable(s"val_$i"), - nullable(s"val_$i"), - nullable(suits(i % 4)), + suits(i % 4)) + + val nullablePrimitiveValues = if (i % 3 == 0) { + Seq.fill(nonNullablePrimitiveValues.length)(null) + } else { + nonNullablePrimitiveValues + } + val complexValues = Seq( Seq.tabulate(3)(n => s"arr_${i + n}"), // Thrift `SET`s are converted to Parquet `LIST`s Seq(i), @@ -71,6 +66,83 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest with Shar Row(Seq.tabulate(3)(j => i + j + m), s"val_${i + m}") } }.toMap) + + Row(nonNullablePrimitiveValues ++ nullablePrimitiveValues ++ complexValues: _*) }) } + + test("SPARK-10136 list of 
primitive list") { + withTempPath { dir => + val path = dir.getCanonicalPath + + // This Parquet schema is translated from the following Thrift schema: + // + // struct ListOfPrimitiveList { + // 1: list> f; + // } + val schema = + s"""message ListOfPrimitiveList { + | required group f (LIST) { + | repeated group f_tuple (LIST) { + | repeated int32 f_tuple_tuple; + | } + | } + |} + """.stripMargin + + writeDirect(path, schema, { rc => + rc.message { + rc.field("f", 0) { + rc.group { + rc.field("f_tuple", 0) { + rc.group { + rc.field("f_tuple_tuple", 0) { + rc.addInteger(0) + rc.addInteger(1) + } + } + + rc.group { + rc.field("f_tuple_tuple", 0) { + rc.addInteger(2) + rc.addInteger(3) + } + } + } + } + } + } + }, { rc => + rc.message { + rc.field("f", 0) { + rc.group { + rc.field("f_tuple", 0) { + rc.group { + rc.field("f_tuple_tuple", 0) { + rc.addInteger(4) + rc.addInteger(5) + } + } + + rc.group { + rc.field("f_tuple_tuple", 0) { + rc.addInteger(6) + rc.addInteger(7) + } + } + } + } + } + } + }) + + logParquetSchema(path) + + checkAnswer( + sqlContext.read.parquet(path), + Seq( + Row(Seq(Seq(0, 1), Seq(2, 3))), + Row(Seq(Seq(4, 5), Seq(6, 7))))) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 53a0e53fd7719..dcbfdca71acb6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -33,8 +33,7 @@ import org.apache.spark.sql.{SQLConf, SQLContext, QueryTest} * without serializing the hashed relation, which does not happen in local mode. */ class BroadcastJoinSuite extends QueryTest with BeforeAndAfterAll { - private var sc: SparkContext = null - private var sqlContext: SQLContext = null + protected var sqlContext: SQLContext = null /** * Create a new [[SQLContext]] running in local-cluster mode with unsafe and codegen enabled. @@ -44,15 +43,14 @@ class BroadcastJoinSuite extends QueryTest with BeforeAndAfterAll { val conf = new SparkConf() .setMaster("local-cluster[2,1,1024]") .setAppName("testing") - sc = new SparkContext(conf) + val sc = new SparkContext(conf) sqlContext = new SQLContext(sc) sqlContext.setConf(SQLConf.UNSAFE_ENABLED, true) sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true) } override def afterAll(): Unit = { - sc.stop() - sc = null + sqlContext.sparkContext.stop() sqlContext = null } @@ -60,7 +58,7 @@ class BroadcastJoinSuite extends QueryTest with BeforeAndAfterAll { * Test whether the specified broadcast join updates the peak execution memory accumulator. 
*/ private def testBroadcastJoin[T: ClassTag](name: String, joinType: String): Unit = { - AccumulatorSuite.verifyPeakExecutionMemorySet(sc, name) { + AccumulatorSuite.verifyPeakExecutionMemorySet(sqlContext.sparkContext, name) { val df1 = sqlContext.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value") val df2 = sqlContext.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value") // Comparison at the end is for broadcast left semi join diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index 4c9187a9a7106..e5fd9e277fc61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -37,7 +37,7 @@ class HashedRelationSuite extends SparkFunSuite with SharedSQLContext { test("GeneralHashedRelation") { val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2)) - val numDataRows = SQLMetrics.createLongMetric(ctx.sparkContext, "data") + val numDataRows = SQLMetrics.createLongMetric(sparkContext, "data") val hashed = HashedRelation(data.iterator, numDataRows, keyProjection) assert(hashed.isInstanceOf[GeneralHashedRelation]) @@ -53,7 +53,7 @@ class HashedRelationSuite extends SparkFunSuite with SharedSQLContext { test("UniqueKeyHashedRelation") { val data = Array(InternalRow(0), InternalRow(1), InternalRow(2)) - val numDataRows = SQLMetrics.createLongMetric(ctx.sparkContext, "data") + val numDataRows = SQLMetrics.createLongMetric(sparkContext, "data") val hashed = HashedRelation(data.iterator, numDataRows, keyProjection) assert(hashed.isInstanceOf[UniqueKeyHashedRelation]) @@ -73,7 +73,7 @@ class HashedRelationSuite extends SparkFunSuite with SharedSQLContext { test("UnsafeHashedRelation") { val schema = StructType(StructField("a", IntegerType, true) :: Nil) val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2)) - val numDataRows = SQLMetrics.createLongMetric(ctx.sparkContext, "data") + val numDataRows = SQLMetrics.createLongMetric(sparkContext, "data") val toUnsafe = UnsafeProjection.create(schema) val unsafeData = data.map(toUnsafe(_).copy()).toArray diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/InnerJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/InnerJoinSuite.scala index cc649b9bd4c45..4174ee055021d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/InnerJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/InnerJoinSuite.scala @@ -27,9 +27,10 @@ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StringType, StructType} class InnerJoinSuite extends SparkPlanTest with SharedSQLContext { + import testImplicits.localSeqToDataFrameHolder - private lazy val myUpperCaseData = ctx.createDataFrame( - ctx.sparkContext.parallelize(Seq( + private lazy val myUpperCaseData = sqlContext.createDataFrame( + sparkContext.parallelize(Seq( Row(1, "A"), Row(2, "B"), Row(3, "C"), @@ -39,8 +40,8 @@ class InnerJoinSuite extends SparkPlanTest with SharedSQLContext { Row(null, "G") )), new StructType().add("N", IntegerType).add("L", StringType)) - private lazy val myLowerCaseData = ctx.createDataFrame( - ctx.sparkContext.parallelize(Seq( + private lazy val myLowerCaseData = sqlContext.createDataFrame( + sparkContext.parallelize(Seq( 
Row(1, "a"), Row(2, "b"), Row(3, "c"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala index a1a617d7b7398..09e0237a7cc50 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala @@ -28,8 +28,8 @@ import org.apache.spark.sql.types.{IntegerType, DoubleType, StructType} class OuterJoinSuite extends SparkPlanTest with SharedSQLContext { - private lazy val left = ctx.createDataFrame( - ctx.sparkContext.parallelize(Seq( + private lazy val left = sqlContext.createDataFrame( + sparkContext.parallelize(Seq( Row(1, 2.0), Row(2, 100.0), Row(2, 1.0), // This row is duplicated to ensure that we will have multiple buffered matches @@ -40,8 +40,8 @@ class OuterJoinSuite extends SparkPlanTest with SharedSQLContext { Row(null, null) )), new StructType().add("a", IntegerType).add("b", DoubleType)) - private lazy val right = ctx.createDataFrame( - ctx.sparkContext.parallelize(Seq( + private lazy val right = sqlContext.createDataFrame( + sparkContext.parallelize(Seq( Row(0, 0.0), Row(2, 3.0), // This row is duplicated to ensure that we will have multiple buffered matches Row(2, -1.0), @@ -76,37 +76,37 @@ class OuterJoinSuite extends SparkPlanTest with SharedSQLContext { test(s"$testName using ShuffledHashOuterJoin") { extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => - withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { - checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => - EnsureRequirements(sqlContext).apply( - ShuffledHashOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right)), - expectedAnswer.map(Row.fromTuple), - sortAnswers = true) - } + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + EnsureRequirements(sqlContext).apply( + ShuffledHashOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right)), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } } } if (joinType != FullOuter) { test(s"$testName using BroadcastHashOuterJoin") { extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => - withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { - checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => - BroadcastHashOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right), - expectedAnswer.map(Row.fromTuple), - sortAnswers = true) - } + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + BroadcastHashOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) + } } } + } - test(s"$testName using SortMergeOuterJoin") { - extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => - withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { - checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => - EnsureRequirements(sqlContext).apply( - SortMergeOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right)), - expectedAnswer.map(Row.fromTuple), - sortAnswers = false) - } + test(s"$testName using SortMergeOuterJoin") { + extractJoinParts().foreach { case (_, leftKeys, rightKeys, boundCondition, _, _) => + 
withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { + checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => + EnsureRequirements(sqlContext).apply( + SortMergeOuterJoin(leftKeys, rightKeys, joinType, boundCondition, left, right)), + expectedAnswer.map(Row.fromTuple), + sortAnswers = true) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala index baa86e320d986..3afd762942bcf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala @@ -28,8 +28,8 @@ import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} class SemiJoinSuite extends SparkPlanTest with SharedSQLContext { - private lazy val left = ctx.createDataFrame( - ctx.sparkContext.parallelize(Seq( + private lazy val left = sqlContext.createDataFrame( + sparkContext.parallelize(Seq( Row(1, 2.0), Row(1, 2.0), Row(2, 1.0), @@ -40,8 +40,8 @@ class SemiJoinSuite extends SparkPlanTest with SharedSQLContext { Row(6, null) )), new StructType().add("a", IntegerType).add("b", DoubleType)) - private lazy val right = ctx.createDataFrame( - ctx.sparkContext.parallelize(Seq( + private lazy val right = sqlContext.createDataFrame( + sparkContext.parallelize(Seq( Row(2, 3.0), Row(2, 3.0), Row(3, 2.0), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/DummyNode.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/DummyNode.scala new file mode 100644 index 0000000000000..efc3227dd60d8 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/DummyNode.scala @@ -0,0 +1,68 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation + +/** + * A dummy [[LocalNode]] that just returns rows from a [[LocalRelation]]. 
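The local-node suites that follow all consume nodes through the same open()/next()/fetch()/close() contract that DummyNode implements. A minimal sketch of a consumer of that contract (the object and method names are illustrative):

package org.apache.spark.sql.execution.local

import org.apache.spark.sql.catalyst.InternalRow

object LocalNodeDrainSketch {
  // Drains a LocalNode using the contract exercised by LocalNodeSuite:
  // open once, call next() until it returns false, and read the current row
  // with fetch(), which must be idempotent between calls to next().
  def drain(node: LocalNode): Seq[InternalRow] = {
    val rows = Seq.newBuilder[InternalRow]
    node.open()
    try {
      while (node.next()) {
        rows += node.fetch().copy() // copy: fetch() may expose a reused row
      }
    } finally {
      node.close()
    }
    rows.result()
  }
}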
+ */ +private[local] case class DummyNode( + output: Seq[Attribute], + relation: LocalRelation, + conf: SQLConf) + extends LocalNode(conf) { + + import DummyNode._ + + private var index: Int = CLOSED + private val input: Seq[InternalRow] = relation.data + + def this(output: Seq[Attribute], data: Seq[Product], conf: SQLConf = new SQLConf) { + this(output, LocalRelation.fromProduct(output, data), conf) + } + + def isOpen: Boolean = index != CLOSED + + override def children: Seq[LocalNode] = Seq.empty + + override def open(): Unit = { + index = -1 + } + + override def next(): Boolean = { + index += 1 + index < input.size + } + + override def fetch(): InternalRow = { + assert(index >= 0 && index < input.size) + input(index) + } + + override def close(): Unit = { + index = CLOSED + } +} + +private object DummyNode { + val CLOSED: Int = Int.MinValue +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala new file mode 100644 index 0000000000000..bbd94d8da2d11 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ExpandNodeSuite.scala @@ -0,0 +1,49 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.catalyst.dsl.expressions._ + + +class ExpandNodeSuite extends LocalNodeTest { + + private def testExpand(inputData: Array[(Int, Int)] = Array.empty): Unit = { + val inputNode = new DummyNode(kvIntAttributes, inputData) + val projections = Seq(Seq('k + 'v, 'k - 'v), Seq('k * 'v, 'k / 'v)) + val expandNode = new ExpandNode(conf, projections, inputNode.output, inputNode) + val resolvedNode = resolveExpressions(expandNode) + val expectedOutput = { + val firstHalf = inputData.map { case (k, v) => (k + v, k - v) } + val secondHalf = inputData.map { case (k, v) => (k * v, k / v) } + firstHalf ++ secondHalf + } + val actualOutput = resolvedNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput.toSet === expectedOutput.toSet) + } + + test("empty") { + testExpand() + } + + test("basic") { + testExpand((1 to 100).map { i => (i, i * 1000) }.toArray) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala new file mode 100644 index 0000000000000..4eadce646d379 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/FilterNodeSuite.scala @@ -0,0 +1,45 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. 
See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.catalyst.dsl.expressions._ + + +class FilterNodeSuite extends LocalNodeTest { + + private def testFilter(inputData: Array[(Int, Int)] = Array.empty): Unit = { + val cond = 'k % 2 === 0 + val inputNode = new DummyNode(kvIntAttributes, inputData) + val filterNode = new FilterNode(conf, cond, inputNode) + val resolvedNode = resolveExpressions(filterNode) + val expectedOutput = inputData.filter { case (k, _) => k % 2 == 0 } + val actualOutput = resolvedNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) + } + + test("empty") { + testFilter() + } + + test("basic") { + testFilter((1 to 100).map { i => (i, i) }.toArray) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala new file mode 100644 index 0000000000000..5c1bdb088eeed --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/HashJoinNodeSuite.scala @@ -0,0 +1,97 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} + + +class HashJoinNodeSuite extends LocalNodeTest { + + // Test all combinations of the two dimensions: with/out unsafe and build sides + private val maybeUnsafeAndCodegen = Seq(false, true) + private val buildSides = Seq(BuildLeft, BuildRight) + maybeUnsafeAndCodegen.foreach { unsafeAndCodegen => + buildSides.foreach { buildSide => + testJoin(unsafeAndCodegen, buildSide) + } + } + + /** + * Test inner hash join with varying degrees of matches. 
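The hash join tests compute their expected answers directly from the inputs rather than from another join implementation. A minimal sketch of that derivation for the inner-join case, assuming unique keys on the right side as in the suite's generated data (the object and helper names are illustrative):

object ExpectedInnerJoinSketch {
  // Expected (id, name, id, nickname) rows for an inner hash join: keep only
  // left rows whose key appears on the right, then pair the two values.
  def expectedInnerJoin(
      left: Array[(Int, String)],
      right: Array[(Int, String)]): Array[(Int, String, Int, String)] = {
    val rightMap = right.toMap
    left
      .filter { case (k, _) => rightMap.contains(k) }
      .map { case (k, v) => (k, v, k, rightMap(k)) }
  }
}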
+ */ + private def testJoin( + unsafeAndCodegen: Boolean, + buildSide: BuildSide): Unit = { + val simpleOrUnsafe = if (!unsafeAndCodegen) "simple" else "unsafe" + val testNamePrefix = s"$simpleOrUnsafe / $buildSide" + val someData = (1 to 100).map { i => (i, "burger" + i) }.toArray + val conf = new SQLConf + conf.setConf(SQLConf.UNSAFE_ENABLED, unsafeAndCodegen) + conf.setConf(SQLConf.CODEGEN_ENABLED, unsafeAndCodegen) + + // Actual test body + def runTest(leftInput: Array[(Int, String)], rightInput: Array[(Int, String)]): Unit = { + val rightInputMap = rightInput.toMap + val leftNode = new DummyNode(joinNameAttributes, leftInput) + val rightNode = new DummyNode(joinNicknameAttributes, rightInput) + val makeNode = (node1: LocalNode, node2: LocalNode) => { + resolveExpressions(new HashJoinNode( + conf, Seq('id1), Seq('id2), buildSide, node1, node2)) + } + val makeUnsafeNode = if (unsafeAndCodegen) wrapForUnsafe(makeNode) else makeNode + val hashJoinNode = makeUnsafeNode(leftNode, rightNode) + val expectedOutput = leftInput + .filter { case (k, _) => rightInputMap.contains(k) } + .map { case (k, v) => (k, v, k, rightInputMap(k)) } + val actualOutput = hashJoinNode.collect().map { row => + // (id, name, id, nickname) + (row.getInt(0), row.getString(1), row.getInt(2), row.getString(3)) + } + assert(actualOutput === expectedOutput) + } + + test(s"$testNamePrefix: empty") { + runTest(Array.empty, Array.empty) + runTest(someData, Array.empty) + runTest(Array.empty, someData) + } + + test(s"$testNamePrefix: no matches") { + val someIrrelevantData = (10000 to 100100).map { i => (i, "piper" + i) }.toArray + runTest(someData, Array.empty) + runTest(Array.empty, someData) + runTest(someData, someIrrelevantData) + runTest(someIrrelevantData, someData) + } + + test(s"$testNamePrefix: partial matches") { + val someOtherData = (50 to 150).map { i => (i, "finnegan" + i) }.toArray + runTest(someData, someOtherData) + runTest(someOtherData, someData) + } + + test(s"$testNamePrefix: full matches") { + val someSuperRelevantData = someData.map { case (k, v) => (k, "cooper" + v) }.toArray + runTest(someData, someSuperRelevantData) + runTest(someSuperRelevantData, someData) + } + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala new file mode 100644 index 0000000000000..c0ad2021b204a --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/IntersectNodeSuite.scala @@ -0,0 +1,37 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + + +class IntersectNodeSuite extends LocalNodeTest { + + test("basic") { + val n = 100 + val leftData = (1 to n).filter { i => i % 2 == 0 }.map { i => (i, i) }.toArray + val rightData = (1 to n).filter { i => i % 3 == 0 }.map { i => (i, i) }.toArray + val leftNode = new DummyNode(kvIntAttributes, leftData) + val rightNode = new DummyNode(kvIntAttributes, rightData) + val intersectNode = new IntersectNode(conf, leftNode, rightNode) + val expectedOutput = leftData.intersect(rightData) + val actualOutput = intersectNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala new file mode 100644 index 0000000000000..fb790636a3689 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LimitNodeSuite.scala @@ -0,0 +1,41 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + + +class LimitNodeSuite extends LocalNodeTest { + + private def testLimit(inputData: Array[(Int, Int)] = Array.empty, limit: Int = 10): Unit = { + val inputNode = new DummyNode(kvIntAttributes, inputData) + val limitNode = new LimitNode(conf, limit, inputNode) + val expectedOutput = inputData.take(limit) + val actualOutput = limitNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) + } + + test("empty") { + testLimit() + } + + test("basic") { + testLimit((1 to 100).map { i => (i, i) }.toArray, 20) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala new file mode 100644 index 0000000000000..0d1ed99eec6cd --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeSuite.scala @@ -0,0 +1,73 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.local + + +class LocalNodeSuite extends LocalNodeTest { + private val data = (1 to 100).map { i => (i, i) }.toArray + + test("basic open, next, fetch, close") { + val node = new DummyNode(kvIntAttributes, data) + assert(!node.isOpen) + node.open() + assert(node.isOpen) + data.foreach { case (k, v) => + assert(node.next()) + // fetch should be idempotent + val fetched = node.fetch() + assert(node.fetch() === fetched) + assert(node.fetch() === fetched) + assert(node.fetch().numFields === 2) + assert(node.fetch().getInt(0) === k) + assert(node.fetch().getInt(1) === v) + } + assert(!node.next()) + node.close() + assert(!node.isOpen) + } + + test("asIterator") { + val node = new DummyNode(kvIntAttributes, data) + val iter = node.asIterator + node.open() + data.foreach { case (k, v) => + // hasNext should be idempotent + assert(iter.hasNext) + assert(iter.hasNext) + val item = iter.next() + assert(item.numFields === 2) + assert(item.getInt(0) === k) + assert(item.getInt(1) === v) + } + intercept[NoSuchElementException] { + iter.next() + } + node.close() + } + + test("collect") { + val node = new DummyNode(kvIntAttributes, data) + node.open() + val collected = node.collect() + assert(collected.size === data.size) + assert(collected.forall(_.size === 2)) + assert(collected.map { case row => (row.getInt(0), row.getInt(0)) } === data) + node.close() + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala new file mode 100644 index 0000000000000..098050bcd2236 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/LocalNodeTest.scala @@ -0,0 +1,70 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.types.{IntegerType, StringType} + + +class LocalNodeTest extends SparkFunSuite { + + protected val conf: SQLConf = new SQLConf + protected val kvIntAttributes = Seq( + AttributeReference("k", IntegerType)(), + AttributeReference("v", IntegerType)()) + protected val joinNameAttributes = Seq( + AttributeReference("id1", IntegerType)(), + AttributeReference("name", StringType)()) + protected val joinNicknameAttributes = Seq( + AttributeReference("id2", IntegerType)(), + AttributeReference("nickname", StringType)()) + + /** + * Wrap a function processing two [[LocalNode]]s such that: + * (1) all input rows are automatically converted to unsafe rows + * (2) all output rows are automatically converted back to safe rows + */ + protected def wrapForUnsafe( + f: (LocalNode, LocalNode) => LocalNode): (LocalNode, LocalNode) => LocalNode = { + (left: LocalNode, right: LocalNode) => { + val _left = ConvertToUnsafeNode(conf, left) + val _right = ConvertToUnsafeNode(conf, right) + val r = f(_left, _right) + ConvertToSafeNode(conf, r) + } + } + + /** + * Recursively resolve all expressions in a [[LocalNode]] using the node's attributes. + */ + protected def resolveExpressions(outputNode: LocalNode): LocalNode = { + outputNode transform { + case node: LocalNode => + val inputMap = node.output.map { a => (a.name, a) }.toMap + node transformExpressions { + case UnresolvedAttribute(Seq(u)) => + inputMap.getOrElse(u, + sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap")) + } + } + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala new file mode 100644 index 0000000000000..40299d9d5ee37 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/NestedLoopJoinNodeSuite.scala @@ -0,0 +1,145 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter} +import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} + + +class NestedLoopJoinNodeSuite extends LocalNodeTest { + + // Test all combinations of the three dimensions: with/out unsafe, build sides, and join types + private val maybeUnsafeAndCodegen = Seq(false, true) + private val buildSides = Seq(BuildLeft, BuildRight) + private val joinTypes = Seq(LeftOuter, RightOuter, FullOuter) + maybeUnsafeAndCodegen.foreach { unsafeAndCodegen => + buildSides.foreach { buildSide => + joinTypes.foreach { joinType => + testJoin(unsafeAndCodegen, buildSide, joinType) + } + } + } + + /** + * Test outer nested loop joins with varying degrees of matches. + */ + private def testJoin( + unsafeAndCodegen: Boolean, + buildSide: BuildSide, + joinType: JoinType): Unit = { + val simpleOrUnsafe = if (!unsafeAndCodegen) "simple" else "unsafe" + val testNamePrefix = s"$simpleOrUnsafe / $buildSide / $joinType" + val someData = (1 to 100).map { i => (i, "burger" + i) }.toArray + val conf = new SQLConf + conf.setConf(SQLConf.UNSAFE_ENABLED, unsafeAndCodegen) + conf.setConf(SQLConf.CODEGEN_ENABLED, unsafeAndCodegen) + + // Actual test body + def runTest( + joinType: JoinType, + leftInput: Array[(Int, String)], + rightInput: Array[(Int, String)]): Unit = { + val leftNode = new DummyNode(joinNameAttributes, leftInput) + val rightNode = new DummyNode(joinNicknameAttributes, rightInput) + val cond = 'id1 === 'id2 + val makeNode = (node1: LocalNode, node2: LocalNode) => { + resolveExpressions( + new NestedLoopJoinNode(conf, node1, node2, buildSide, joinType, Some(cond))) + } + val makeUnsafeNode = if (unsafeAndCodegen) wrapForUnsafe(makeNode) else makeNode + val hashJoinNode = makeUnsafeNode(leftNode, rightNode) + val expectedOutput = generateExpectedOutput(leftInput, rightInput, joinType) + val actualOutput = hashJoinNode.collect().map { row => + // (id, name, id, nickname) + (row.getInt(0), row.getString(1), row.getInt(2), row.getString(3)) + } + assert(actualOutput.toSet === expectedOutput.toSet) + } + + test(s"$testNamePrefix: empty") { + runTest(joinType, Array.empty, Array.empty) + } + + test(s"$testNamePrefix: no matches") { + val someIrrelevantData = (10000 to 10100).map { i => (i, "piper" + i) }.toArray + runTest(joinType, someData, Array.empty) + runTest(joinType, Array.empty, someData) + runTest(joinType, someData, someIrrelevantData) + runTest(joinType, someIrrelevantData, someData) + } + + test(s"$testNamePrefix: partial matches") { + val someOtherData = (50 to 150).map { i => (i, "finnegan" + i) }.toArray + runTest(joinType, someData, someOtherData) + runTest(joinType, someOtherData, someData) + } + + test(s"$testNamePrefix: full matches") { + val someSuperRelevantData = someData.map { case (k, v) => (k, "cooper" + v) } + runTest(joinType, someData, someSuperRelevantData) + runTest(joinType, someSuperRelevantData, someData) + } + } + + /** + * Helper method to generate the expected output of a test based on the join type. 
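The expected-output helper that follows pads unmatched rows with 0 for the missing key and null for the missing value. A minimal sketch of just the left-outer branch of that convention (the object and helper names are illustrative):

object ExpectedLeftOuterJoinSketch {
  // Left outer join expectation: every left row survives; rows without a
  // right-side match get 0 as the right key and null as the right value.
  def expectedLeftOuter(
      left: Array[(Int, String)],
      right: Array[(Int, String)]): Array[(Int, String, Int, String)] = {
    val rightMap = right.toMap
    left.map { case (k, v) =>
      if (rightMap.contains(k)) (k, v, k, rightMap(k))
      else (k, v, 0, null)
    }
  }
}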
+ */ + private def generateExpectedOutput( + leftInput: Array[(Int, String)], + rightInput: Array[(Int, String)], + joinType: JoinType): Array[(Int, String, Int, String)] = { + joinType match { + case LeftOuter => + val rightInputMap = rightInput.toMap + leftInput.map { case (k, v) => + val rightKey = rightInputMap.get(k).map { _ => k }.getOrElse(0) + val rightValue = rightInputMap.getOrElse(k, null) + (k, v, rightKey, rightValue) + } + + case RightOuter => + val leftInputMap = leftInput.toMap + rightInput.map { case (k, v) => + val leftKey = leftInputMap.get(k).map { _ => k }.getOrElse(0) + val leftValue = leftInputMap.getOrElse(k, null) + (leftKey, leftValue, k, v) + } + + case FullOuter => + val leftInputMap = leftInput.toMap + val rightInputMap = rightInput.toMap + val leftOutput = leftInput.map { case (k, v) => + val rightKey = rightInputMap.get(k).map { _ => k }.getOrElse(0) + val rightValue = rightInputMap.getOrElse(k, null) + (k, v, rightKey, rightValue) + } + val rightOutput = rightInput.map { case (k, v) => + val leftKey = leftInputMap.get(k).map { _ => k }.getOrElse(0) + val leftValue = leftInputMap.getOrElse(k, null) + (leftKey, leftValue, k, v) + } + (leftOutput ++ rightOutput).distinct + + case other => + throw new IllegalArgumentException(s"Join type $other is not applicable") + } + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala new file mode 100644 index 0000000000000..02ecb23d34b2f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/ProjectNodeSuite.scala @@ -0,0 +1,49 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, NamedExpression} +import org.apache.spark.sql.types.{IntegerType, StringType} + + +class ProjectNodeSuite extends LocalNodeTest { + private val pieAttributes = Seq( + AttributeReference("id", IntegerType)(), + AttributeReference("age", IntegerType)(), + AttributeReference("name", StringType)()) + + private def testProject(inputData: Array[(Int, Int, String)] = Array.empty): Unit = { + val inputNode = new DummyNode(pieAttributes, inputData) + val columns = Seq[NamedExpression](inputNode.output(0), inputNode.output(2)) + val projectNode = new ProjectNode(conf, columns, inputNode) + val expectedOutput = inputData.map { case (id, age, name) => (id, name) } + val actualOutput = projectNode.collect().map { case row => + (row.getInt(0), row.getString(1)) + } + assert(actualOutput === expectedOutput) + } + + test("empty") { + testProject() + } + + test("basic") { + testProject((1 to 100).map { i => (i, i + 1, "pie" + i) }.toArray) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala new file mode 100644 index 0000000000000..a3e83bbd51457 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/SampleNodeSuite.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.local + +import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} + + +class SampleNodeSuite extends LocalNodeTest { + + private def testSample(withReplacement: Boolean): Unit = { + val seed = 0L + val lowerb = 0.0 + val upperb = 0.3 + val maybeOut = if (withReplacement) "" else "out" + test(s"with$maybeOut replacement") { + val inputData = (1 to 1000).map { i => (i, i) }.toArray + val inputNode = new DummyNode(kvIntAttributes, inputData) + val sampleNode = new SampleNode(conf, lowerb, upperb, withReplacement, seed, inputNode) + val sampler = + if (withReplacement) { + new PoissonSampler[(Int, Int)](upperb - lowerb, useGapSamplingIfPossible = false) + } else { + new BernoulliCellSampler[(Int, Int)](lowerb, upperb) + } + sampler.setSeed(seed) + val expectedOutput = sampler.sample(inputData.iterator).toArray + val actualOutput = sampleNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) + } + } + + testSample(withReplacement = true) + testSample(withReplacement = false) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala new file mode 100644 index 0000000000000..42ebc7bfcaadc --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/TakeOrderedAndProjectNodeSuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.local + +import scala.util.Random + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.SortOrder + + +class TakeOrderedAndProjectNodeSuite extends LocalNodeTest { + + private def testTakeOrderedAndProject(desc: Boolean): Unit = { + val limit = 10 + val ascOrDesc = if (desc) "desc" else "asc" + test(ascOrDesc) { + val inputData = Random.shuffle((1 to 100).toList).map { i => (i, i) }.toArray + val inputNode = new DummyNode(kvIntAttributes, inputData) + val firstColumn = inputNode.output(0) + val sortDirection = if (desc) Descending else Ascending + val sortOrder = SortOrder(firstColumn, sortDirection) + val takeOrderAndProjectNode = new TakeOrderedAndProjectNode( + conf, limit, Seq(sortOrder), Some(Seq(firstColumn)), inputNode) + val expectedOutput = inputData + .map { case (k, _) => k } + .sortBy { k => k * (if (desc) -1 else 1) } + .take(limit) + val actualOutput = takeOrderAndProjectNode.collect().map { row => row.getInt(0) } + assert(actualOutput === expectedOutput) + } + } + + testTakeOrderedAndProject(desc = false) + testTakeOrderedAndProject(desc = true) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala new file mode 100644 index 0000000000000..666b0235c061d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/local/UnionNodeSuite.scala @@ -0,0 +1,55 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.local + + +class UnionNodeSuite extends LocalNodeTest { + + private def testUnion(inputData: Seq[Array[(Int, Int)]]): Unit = { + val inputNodes = inputData.map { data => + new DummyNode(kvIntAttributes, data) + } + val unionNode = new UnionNode(conf, inputNodes) + val expectedOutput = inputData.flatten + val actualOutput = unionNode.collect().map { case row => + (row.getInt(0), row.getInt(1)) + } + assert(actualOutput === expectedOutput) + } + + test("empty") { + testUnion(Seq(Array.empty)) + testUnion(Seq(Array.empty, Array.empty)) + } + + test("self") { + val data = (1 to 100).map { i => (i, i) }.toArray + testUnion(Seq(data)) + testUnion(Seq(data, data)) + testUnion(Seq(data, data, data)) + } + + test("basic") { + val zero = Array.empty[(Int, Int)] + val one = (1 to 100).map { i => (i, i) }.toArray + val two = (50 to 150).map { i => (i, i) }.toArray + val three = (800 to 900).map { i => (i, i) }.toArray + testUnion(Seq(zero, one, two, three)) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 80006bf077fe8..6afffae161ef6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -36,7 +36,7 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { import testImplicits._ test("LongSQLMetric should not box Long") { - val l = SQLMetrics.createLongMetric(ctx.sparkContext, "long") + val l = SQLMetrics.createLongMetric(sparkContext, "long") val f = () => { l += 1L l.add(1L) @@ -50,7 +50,7 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { test("Normal accumulator should do boxing") { // We need this test to make sure BoxingFinder works. - val l = ctx.sparkContext.accumulator(0L) + val l = sparkContext.accumulator(0L) val f = () => { l += 1L } BoxingFinder.getClassReader(f.getClass).foreach { cl => val boxingFinder = new BoxingFinder() @@ -71,19 +71,19 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { df: DataFrame, expectedNumOfJobs: Int, expectedMetrics: Map[Long, (String, Map[String, Any])]): Unit = { - val previousExecutionIds = ctx.listener.executionIdToData.keySet + val previousExecutionIds = sqlContext.listener.executionIdToData.keySet df.collect() - ctx.sparkContext.listenerBus.waitUntilEmpty(10000) - val executionIds = ctx.listener.executionIdToData.keySet.diff(previousExecutionIds) + sparkContext.listenerBus.waitUntilEmpty(10000) + val executionIds = sqlContext.listener.executionIdToData.keySet.diff(previousExecutionIds) assert(executionIds.size === 1) val executionId = executionIds.head - val jobs = ctx.listener.getExecution(executionId).get.jobs + val jobs = sqlContext.listener.getExecution(executionId).get.jobs // Use "<=" because there is a race condition that we may miss some jobs // TODO Change it to "=" once we fix the race condition that missing the JobStarted event. 
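SQLMetricsSuite relies on a snapshot-and-diff pattern over the listener's known execution IDs. A minimal sketch of that pattern, assuming the same private[sql] accessors the suite itself uses (sqlContext.listener, getExecutionMetrics) and assuming getExecutionMetrics returns a map from metric id to value (the object and method names are illustrative):

package org.apache.spark.sql.execution.metric

import org.apache.spark.sql.{DataFrame, SQLContext}

object ExecutionMetricsSketch {
  // Runs df.collect() and returns the metric values of the single SQL
  // execution it triggers: record the known execution IDs, run the action,
  // wait for the listener bus to drain, then diff to find the new execution.
  def collectAndGetMetrics(sqlContext: SQLContext, df: DataFrame): Map[Long, Any] = {
    val before = sqlContext.listener.executionIdToData.keySet
    df.collect()
    sqlContext.sparkContext.listenerBus.waitUntilEmpty(10000)
    val newIds = sqlContext.listener.executionIdToData.keySet.diff(before)
    assert(newIds.size == 1, s"expected exactly one new execution, got $newIds")
    sqlContext.listener.getExecutionMetrics(newIds.head)
  }
}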
assert(jobs.size <= expectedNumOfJobs) if (jobs.size == expectedNumOfJobs) { // If we can track all jobs, check the metric values - val metricValues = ctx.listener.getExecutionMetrics(executionId) + val metricValues = sqlContext.listener.getExecutionMetrics(executionId) val actualMetrics = SparkPlanGraph(df.queryExecution.executedPlan).nodes.filter { node => expectedMetrics.contains(node.id) }.map { node => @@ -474,19 +474,19 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { test("save metrics") { withTempPath { file => - val previousExecutionIds = ctx.listener.executionIdToData.keySet + val previousExecutionIds = sqlContext.listener.executionIdToData.keySet // Assume the execution plan is // PhysicalRDD(nodeId = 0) person.select('name).write.format("json").save(file.getAbsolutePath) - ctx.sparkContext.listenerBus.waitUntilEmpty(10000) - val executionIds = ctx.listener.executionIdToData.keySet.diff(previousExecutionIds) + sparkContext.listenerBus.waitUntilEmpty(10000) + val executionIds = sqlContext.listener.executionIdToData.keySet.diff(previousExecutionIds) assert(executionIds.size === 1) val executionId = executionIds.head - val jobs = ctx.listener.getExecution(executionId).get.jobs + val jobs = sqlContext.listener.getExecution(executionId).get.jobs // Use "<=" because there is a race condition that we may miss some jobs // TODO Change "<=" to "=" once we fix the race condition that missing the JobStarted event. assert(jobs.size <= 1) - val metricValues = ctx.listener.getExecutionMetrics(executionId) + val metricValues = sqlContext.listener.getExecutionMetrics(executionId) // Because "save" will create a new DataFrame internally, we cannot get the real metric id. // However, we still can check the value. assert(metricValues.values.toSeq === Seq(2L)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index 80d1e88956949..7a46c69a056b1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -54,9 +54,9 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { details = "" ) - private def createTaskInfo(taskId: Int, attempt: Int): TaskInfo = new TaskInfo( + private def createTaskInfo(taskId: Int, attemptNumber: Int): TaskInfo = new TaskInfo( taskId = taskId, - attempt = attempt, + attemptNumber = attemptNumber, // The following fields are not used in tests index = 0, launchTime = 0, @@ -74,7 +74,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { } test("basic") { - val listener = new SQLListener(ctx) + val listener = new SQLListener(sqlContext) val executionId = 0 val df = createTestDataFrame val accumulatorIds = @@ -212,7 +212,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { } test("onExecutionEnd happens before onJobEnd(JobSucceeded)") { - val listener = new SQLListener(ctx) + val listener = new SQLListener(sqlContext) val executionId = 0 val df = createTestDataFrame listener.onExecutionStart( @@ -241,7 +241,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { } test("onExecutionEnd happens before multiple onJobEnd(JobSucceeded)s") { - val listener = new SQLListener(ctx) + val listener = new SQLListener(sqlContext) val executionId = 0 val df = createTestDataFrame listener.onExecutionStart( @@ -281,7 +281,7 @@ class SQLListenerSuite 
extends SparkFunSuite with SharedSQLContext { } test("onExecutionEnd happens before onJobEnd(JobFailed)") { - val listener = new SQLListener(ctx) + val listener = new SQLListener(sqlContext) val executionId = 0 val df = createTestDataFrame listener.onExecutionStart( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 0edac0848c3bb..c4b039a9c5359 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -255,26 +255,26 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext } test("Basic API") { - assert(ctx.read.jdbc( + assert(sqlContext.read.jdbc( urlWithUserAndPass, "TEST.PEOPLE", new Properties).collect().length === 3) } test("Basic API with FetchSize") { val properties = new Properties properties.setProperty("fetchSize", "2") - assert(ctx.read.jdbc( + assert(sqlContext.read.jdbc( urlWithUserAndPass, "TEST.PEOPLE", properties).collect().length === 3) } test("Partitioning via JDBCPartitioningInfo API") { assert( - ctx.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3, new Properties) + sqlContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3, new Properties) .collect().length === 3) } test("Partitioning via list-of-where-clauses API") { val parts = Array[String]("THEID < 2", "THEID >= 2") - assert(ctx.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts, new Properties) + assert(sqlContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts, new Properties) .collect().length === 3) } @@ -330,9 +330,9 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext } test("test DATE types") { - val rows = ctx.read.jdbc( + val rows = sqlContext.read.jdbc( urlWithUserAndPass, "TEST.TIMETYPES", new Properties).collect() - val cachedRows = ctx.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties) + val cachedRows = sqlContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties) .cache().collect() assert(rows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01")) assert(rows(1).getAs[java.sql.Date](1) === null) @@ -340,8 +340,8 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext } test("test DATE types in cache") { - val rows = ctx.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties).collect() - ctx.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties) + val rows = sqlContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties).collect() + sqlContext.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties) .cache().registerTempTable("mycached_date") val cachedRows = sql("select * from mycached_date").collect() assert(rows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01")) @@ -349,7 +349,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext } test("test types for null value") { - val rows = ctx.read.jdbc( + val rows = sqlContext.read.jdbc( urlWithUserAndPass, "TEST.NULLTYPES", new Properties).collect() assert((0 to 14).forall(i => rows(0).isNullAt(i))) } @@ -396,7 +396,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext test("Remap types via JdbcDialects") { JdbcDialects.registerDialect(testH2Dialect) - val df = ctx.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", new Properties) + val df = sqlContext.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", new 
Properties) assert(df.schema.filter(_.dataType != org.apache.spark.sql.types.StringType).isEmpty) val rows = df.collect() assert(rows(0).get(0).isInstanceOf[String]) @@ -407,6 +407,8 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext test("Default jdbc dialect registration") { assert(JdbcDialects.get("jdbc:mysql://127.0.0.1/db") == MySQLDialect) assert(JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") == PostgresDialect) + assert(JdbcDialects.get("jdbc:db2://127.0.0.1/db") == DB2Dialect) + assert(JdbcDialects.get("jdbc:sqlserver://127.0.0.1/db") == MsSqlServerDialect) assert(JdbcDialects.get("test.invalid") == NoopDialect) } @@ -443,4 +445,24 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext assert(agg.getCatalystType(0, "", 1, null) === Some(LongType)) assert(agg.getCatalystType(1, "", 1, null) === Some(StringType)) } + + test("DB2Dialect type mapping") { + val db2Dialect = JdbcDialects.get("jdbc:db2://127.0.0.1/db") + assert(db2Dialect.getJDBCType(StringType).map(_.databaseTypeDefinition).get == "CLOB") + assert(db2Dialect.getJDBCType(BooleanType).map(_.databaseTypeDefinition).get == "CHAR(1)") + } + + test("table exists query by jdbc dialect") { + val MySQL = JdbcDialects.get("jdbc:mysql://127.0.0.1/db") + val Postgres = JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") + val db2 = JdbcDialects.get("jdbc:db2://127.0.0.1/db") + val h2 = JdbcDialects.get(url) + val table = "weblogs" + val defaultQuery = s"SELECT * FROM $table WHERE 1=0" + val limitQuery = s"SELECT 1 FROM $table LIMIT 1" + assert(MySQL.getTableExistsQuery(table) == limitQuery) + assert(Postgres.getTableExistsQuery(table) == limitQuery) + assert(db2.getTableExistsQuery(table) == defaultQuery) + assert(h2.getTableExistsQuery(table) == defaultQuery) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index 5dc3a2c07b8c7..e23ee6693133b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -22,13 +22,12 @@ import java.util.Properties import org.scalatest.BeforeAndAfter -import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext { +class JDBCWriteSuite extends SharedSQLContext with BeforeAndAfter { val url = "jdbc:h2:mem:testdb2" var conn: java.sql.Connection = null @@ -76,8 +75,6 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLCon conn1.close() } - private lazy val sc = ctx.sparkContext - private lazy val arr2x2 = Array[Row](Row.apply("dave", 42), Row.apply("mary", 222)) private lazy val arr1x2 = Array[Row](Row.apply("fred", 3)) private lazy val schema2 = StructType( @@ -91,49 +88,50 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLCon StructField("seq", IntegerType) :: Nil) test("Basic CREATE") { - val df = ctx.createDataFrame(sc.parallelize(arr2x2), schema2) + val df = sqlContext.createDataFrame(sparkContext.parallelize(arr2x2), schema2) df.write.jdbc(url, "TEST.BASICCREATETEST", new Properties) - assert(2 === ctx.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).count) - assert(2 === ctx.read.jdbc(url, "TEST.BASICCREATETEST", new 
Properties).collect()(0).length) + assert(2 === sqlContext.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).count) + assert( + 2 === sqlContext.read.jdbc(url, "TEST.BASICCREATETEST", new Properties).collect()(0).length) } test("CREATE with overwrite") { - val df = ctx.createDataFrame(sc.parallelize(arr2x3), schema3) - val df2 = ctx.createDataFrame(sc.parallelize(arr1x2), schema2) + val df = sqlContext.createDataFrame(sparkContext.parallelize(arr2x3), schema3) + val df2 = sqlContext.createDataFrame(sparkContext.parallelize(arr1x2), schema2) df.write.jdbc(url1, "TEST.DROPTEST", properties) - assert(2 === ctx.read.jdbc(url1, "TEST.DROPTEST", properties).count) - assert(3 === ctx.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length) + assert(2 === sqlContext.read.jdbc(url1, "TEST.DROPTEST", properties).count) + assert(3 === sqlContext.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length) df2.write.mode(SaveMode.Overwrite).jdbc(url1, "TEST.DROPTEST", properties) - assert(1 === ctx.read.jdbc(url1, "TEST.DROPTEST", properties).count) - assert(2 === ctx.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length) + assert(1 === sqlContext.read.jdbc(url1, "TEST.DROPTEST", properties).count) + assert(2 === sqlContext.read.jdbc(url1, "TEST.DROPTEST", properties).collect()(0).length) } test("CREATE then INSERT to append") { - val df = ctx.createDataFrame(sc.parallelize(arr2x2), schema2) - val df2 = ctx.createDataFrame(sc.parallelize(arr1x2), schema2) + val df = sqlContext.createDataFrame(sparkContext.parallelize(arr2x2), schema2) + val df2 = sqlContext.createDataFrame(sparkContext.parallelize(arr1x2), schema2) df.write.jdbc(url, "TEST.APPENDTEST", new Properties) df2.write.mode(SaveMode.Append).jdbc(url, "TEST.APPENDTEST", new Properties) - assert(3 === ctx.read.jdbc(url, "TEST.APPENDTEST", new Properties).count) - assert(2 === ctx.read.jdbc(url, "TEST.APPENDTEST", new Properties).collect()(0).length) + assert(3 === sqlContext.read.jdbc(url, "TEST.APPENDTEST", new Properties).count) + assert(2 === sqlContext.read.jdbc(url, "TEST.APPENDTEST", new Properties).collect()(0).length) } test("CREATE then INSERT to truncate") { - val df = ctx.createDataFrame(sc.parallelize(arr2x2), schema2) - val df2 = ctx.createDataFrame(sc.parallelize(arr1x2), schema2) + val df = sqlContext.createDataFrame(sparkContext.parallelize(arr2x2), schema2) + val df2 = sqlContext.createDataFrame(sparkContext.parallelize(arr1x2), schema2) df.write.jdbc(url1, "TEST.TRUNCATETEST", properties) df2.write.mode(SaveMode.Overwrite).jdbc(url1, "TEST.TRUNCATETEST", properties) - assert(1 === ctx.read.jdbc(url1, "TEST.TRUNCATETEST", properties).count) - assert(2 === ctx.read.jdbc(url1, "TEST.TRUNCATETEST", properties).collect()(0).length) + assert(1 === sqlContext.read.jdbc(url1, "TEST.TRUNCATETEST", properties).count) + assert(2 === sqlContext.read.jdbc(url1, "TEST.TRUNCATETEST", properties).collect()(0).length) } test("Incompatible INSERT to append") { - val df = ctx.createDataFrame(sc.parallelize(arr2x2), schema2) - val df2 = ctx.createDataFrame(sc.parallelize(arr2x3), schema3) + val df = sqlContext.createDataFrame(sparkContext.parallelize(arr2x2), schema2) + val df2 = sqlContext.createDataFrame(sparkContext.parallelize(arr2x3), schema3) df.write.jdbc(url, "TEST.INCOMPATIBLETEST", new Properties) intercept[org.apache.spark.SparkException] { @@ -143,14 +141,14 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLCon test("INSERT to JDBC Datasource") { sql("INSERT INTO TABLE 
PEOPLE1 SELECT * FROM PEOPLE") - assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).count) - assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length) + assert(2 === sqlContext.read.jdbc(url1, "TEST.PEOPLE1", properties).count) + assert(2 === sqlContext.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length) } test("INSERT to JDBC Datasource with overwrite") { sql("INSERT INTO TABLE PEOPLE1 SELECT * FROM PEOPLE") sql("INSERT OVERWRITE TABLE PEOPLE1 SELECT * FROM PEOPLE") - assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).count) - assert(2 === ctx.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length) + assert(2 === sqlContext.read.jdbc(url1, "TEST.PEOPLE1", properties).count) + assert(2 === sqlContext.read.jdbc(url1, "TEST.PEOPLE1", properties).collect()(0).length) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 9bc3f6bcf6fce..6fc9febe49707 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -26,10 +26,8 @@ import org.apache.spark.sql.execution.datasources.DDLException import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils - class CreateTableAsSelectSuite extends DataSourceTest with SharedSQLContext with BeforeAndAfter { protected override lazy val sql = caseInsensitiveContext.sql _ - private lazy val sparkContext = caseInsensitiveContext.sparkContext private var path: File = null override def beforeAll(): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala index d74d29fb0beb0..af04079ec895a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala @@ -19,13 +19,11 @@ package org.apache.spark.sql.sources import org.apache.spark.sql._ - private[sql] abstract class DataSourceTest extends QueryTest { - protected def _sqlContext: SQLContext // We want to test some edge cases. 
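Across these suites the pattern is the same: the per-suite _sqlContext and sc indirection goes away in favour of the sqlContext and sparkContext members inherited from the shared test traits. A minimal sketch of the resulting suite shape, assuming a hypothetical ExampleSuite that sits next to these suites so SharedSQLContext is importable (the table name and rows are also placeholders):

    class ExampleSuite extends SharedSQLContext {
      test("round trip through a temp table") {
        // sqlContext and sparkContext both come from the shared trait.
        val df = sqlContext.createDataFrame(sparkContext.parallelize(Seq((1, "a"), (2, "b"))))
          .toDF("id", "name")
        df.registerTempTable("example")
        assert(sqlContext.table("example").count() === 2)
      }
    }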
protected lazy val caseInsensitiveContext: SQLContext = { - val ctx = new SQLContext(_sqlContext.sparkContext) + val ctx = new SQLContext(sqlContext.sparkContext) ctx.setConf(SQLConf.CASE_SENSITIVE, false) ctx } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 084d83f6e9bff..5b70d258d6ce3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -19,13 +19,12 @@ package org.apache.spark.sql.sources import java.io.File -import org.apache.spark.sql.{SaveMode, AnalysisException, Row} +import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils class InsertSuite extends DataSourceTest with SharedSQLContext { protected override lazy val sql = caseInsensitiveContext.sql _ - private lazy val sparkContext = caseInsensitiveContext.sparkContext private var path: File = null override def beforeAll(): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 79b6e9b45c009..c9791879ec74c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -29,11 +29,11 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { val path = Utils.createTempDir() path.delete() - val df = ctx.range(100).select($"id", lit(1).as("data")) + val df = sqlContext.range(100).select($"id", lit(1).as("data")) df.write.partitionBy("id").save(path.getCanonicalPath) checkAnswer( - ctx.read.load(path.getCanonicalPath), + sqlContext.read.load(path.getCanonicalPath), (0 to 99).map(Row(1, _)).toSeq) Utils.deleteRecursively(path) @@ -43,12 +43,12 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { val path = Utils.createTempDir() path.delete() - val base = ctx.range(100) + val base = sqlContext.range(100) val df = base.unionAll(base).select($"id", lit(1).as("data")) df.write.partitionBy("id").save(path.getCanonicalPath) checkAnswer( - ctx.read.load(path.getCanonicalPath), + sqlContext.read.load(path.getCanonicalPath), (0 to 99).map(Row(1, _)).toSeq ++ (0 to 99).map(Row(1, _)).toSeq) Utils.deleteRecursively(path) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala index f18546b4c2d9b..10d261368993d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala @@ -28,7 +28,6 @@ import org.apache.spark.util.Utils class SaveLoadSuite extends DataSourceTest with SharedSQLContext with BeforeAndAfter { protected override lazy val sql = caseInsensitiveContext.sql _ - private lazy val sparkContext = caseInsensitiveContext.sparkContext private var originalDefaultSource: String = null private var path: File = null private var df: DataFrame = null diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala index 1374a97476ca1..520dea7f7dd92 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala @@ -24,11 +24,11 @@ import org.apache.spark.sql.{DataFrame, SQLContext, SQLImplicits} * A collection of sample data used in SQL tests. */ private[sql] trait SQLTestData { self => - protected def _sqlContext: SQLContext + protected def sqlContext: SQLContext // Helper object to import SQL implicits without a concrete SQLContext private object internalImplicits extends SQLImplicits { - protected override def _sqlContext: SQLContext = self._sqlContext + protected override def _sqlContext: SQLContext = self.sqlContext } import internalImplicits._ @@ -36,15 +36,22 @@ private[sql] trait SQLTestData { self => // Note: all test data should be lazy because the SQLContext is not set up yet. + protected lazy val emptyTestData: DataFrame = { + val df = sqlContext.sparkContext.parallelize( + Seq.empty[Int].map(i => TestData(i, i.toString))).toDF() + df.registerTempTable("emptyTestData") + df + } + protected lazy val testData: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( (1 to 100).map(i => TestData(i, i.toString))).toDF() df.registerTempTable("testData") df } protected lazy val testData2: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( TestData2(1, 1) :: TestData2(1, 2) :: TestData2(2, 1) :: @@ -56,7 +63,7 @@ private[sql] trait SQLTestData { self => } protected lazy val testData3: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( TestData3(1, None) :: TestData3(2, Some(2)) :: Nil).toDF() df.registerTempTable("testData3") @@ -64,14 +71,14 @@ private[sql] trait SQLTestData { self => } protected lazy val negativeData: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( (1 to 100).map(i => TestData(-i, (-i).toString))).toDF() df.registerTempTable("negativeData") df } protected lazy val largeAndSmallInts: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( LargeAndSmallInts(2147483644, 1) :: LargeAndSmallInts(1, 2) :: LargeAndSmallInts(2147483645, 1) :: @@ -83,7 +90,7 @@ private[sql] trait SQLTestData { self => } protected lazy val decimalData: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( DecimalData(1, 1) :: DecimalData(1, 2) :: DecimalData(2, 1) :: @@ -95,7 +102,7 @@ private[sql] trait SQLTestData { self => } protected lazy val binaryData: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( BinaryData("12".getBytes, 1) :: BinaryData("22".getBytes, 5) :: BinaryData("122".getBytes, 3) :: @@ -106,7 +113,7 @@ private[sql] trait SQLTestData { self => } protected lazy val upperCaseData: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( UpperCaseData(1, "A") :: UpperCaseData(2, "B") :: UpperCaseData(3, "C") :: @@ -118,7 +125,7 @@ private[sql] trait SQLTestData { self => } protected lazy val lowerCaseData: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( LowerCaseData(1, "a") :: LowerCaseData(2, "b") :: LowerCaseData(3, "c") :: @@ -128,7 +135,7 @@ private[sql] trait SQLTestData { self => } protected lazy val arrayData: RDD[ArrayData] = { - val rdd = _sqlContext.sparkContext.parallelize( + val rdd = 
sqlContext.sparkContext.parallelize( ArrayData(Seq(1, 2, 3), Seq(Seq(1, 2, 3))) :: ArrayData(Seq(2, 3, 4), Seq(Seq(2, 3, 4))) :: Nil) rdd.toDF().registerTempTable("arrayData") @@ -136,7 +143,7 @@ private[sql] trait SQLTestData { self => } protected lazy val mapData: RDD[MapData] = { - val rdd = _sqlContext.sparkContext.parallelize( + val rdd = sqlContext.sparkContext.parallelize( MapData(Map(1 -> "a1", 2 -> "b1", 3 -> "c1", 4 -> "d1", 5 -> "e1")) :: MapData(Map(1 -> "a2", 2 -> "b2", 3 -> "c2", 4 -> "d2")) :: MapData(Map(1 -> "a3", 2 -> "b3", 3 -> "c3")) :: @@ -147,13 +154,13 @@ private[sql] trait SQLTestData { self => } protected lazy val repeatedData: RDD[StringData] = { - val rdd = _sqlContext.sparkContext.parallelize(List.fill(2)(StringData("test"))) + val rdd = sqlContext.sparkContext.parallelize(List.fill(2)(StringData("test"))) rdd.toDF().registerTempTable("repeatedData") rdd } protected lazy val nullableRepeatedData: RDD[StringData] = { - val rdd = _sqlContext.sparkContext.parallelize( + val rdd = sqlContext.sparkContext.parallelize( List.fill(2)(StringData(null)) ++ List.fill(2)(StringData("test"))) rdd.toDF().registerTempTable("nullableRepeatedData") @@ -161,7 +168,7 @@ private[sql] trait SQLTestData { self => } protected lazy val nullInts: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( NullInts(1) :: NullInts(2) :: NullInts(3) :: @@ -171,7 +178,7 @@ private[sql] trait SQLTestData { self => } protected lazy val allNulls: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( NullInts(null) :: NullInts(null) :: NullInts(null) :: @@ -181,7 +188,7 @@ private[sql] trait SQLTestData { self => } protected lazy val nullStrings: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( NullStrings(1, "abc") :: NullStrings(2, "ABC") :: NullStrings(3, null) :: Nil).toDF() @@ -190,13 +197,13 @@ private[sql] trait SQLTestData { self => } protected lazy val tableName: DataFrame = { - val df = _sqlContext.sparkContext.parallelize(TableName("test") :: Nil).toDF() + val df = sqlContext.sparkContext.parallelize(TableName("test") :: Nil).toDF() df.registerTempTable("tableName") df } protected lazy val unparsedStrings: RDD[String] = { - _sqlContext.sparkContext.parallelize( + sqlContext.sparkContext.parallelize( "1, A1, true, null" :: "2, B2, false, null" :: "3, C3, true, null" :: @@ -205,13 +212,13 @@ private[sql] trait SQLTestData { self => // An RDD with 4 elements and 8 partitions protected lazy val withEmptyParts: RDD[IntField] = { - val rdd = _sqlContext.sparkContext.parallelize((1 to 4).map(IntField), 8) + val rdd = sqlContext.sparkContext.parallelize((1 to 4).map(IntField), 8) rdd.toDF().registerTempTable("withEmptyParts") rdd } protected lazy val person: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( Person(0, "mike", 30) :: Person(1, "jim", 20) :: Nil).toDF() df.registerTempTable("person") @@ -219,7 +226,7 @@ private[sql] trait SQLTestData { self => } protected lazy val salary: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = sqlContext.sparkContext.parallelize( Salary(0, 2000.0) :: Salary(1, 1000.0) :: Nil).toDF() df.registerTempTable("salary") @@ -227,7 +234,7 @@ private[sql] trait SQLTestData { self => } protected lazy val complexData: DataFrame = { - val df = _sqlContext.sparkContext.parallelize( + val df = 
sqlContext.sparkContext.parallelize( ComplexData(Map("1" -> 1), TestData(1, "1"), Seq(1, 1, 1), true) :: ComplexData(Map("2" -> 2), TestData(2, "2"), Seq(2, 2, 2), false) :: Nil).toDF() @@ -239,7 +246,8 @@ private[sql] trait SQLTestData { self => * Initialize all test data such that all temp tables are properly registered. */ def loadTestData(): Unit = { - assert(_sqlContext != null, "attempted to initialize test data before SQLContext.") + assert(sqlContext != null, "attempted to initialize test data before SQLContext.") + emptyTestData testData testData2 testData3 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index cdd691e035897..9214569f18e93 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -27,8 +27,9 @@ import org.apache.hadoop.conf.Configuration import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.{DataFrame, SQLContext, SQLImplicits} +import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.util.Utils /** @@ -46,13 +47,13 @@ private[sql] trait SQLTestUtils with BeforeAndAfterAll with SQLTestData { self => - protected def _sqlContext: SQLContext + protected def sparkContext = sqlContext.sparkContext // Whether to materialize all test data before the first test is run private var loadTestDataBeforeTests = false // Shorthand for running a query using our SQLContext - protected lazy val sql = _sqlContext.sql _ + protected lazy val sql = sqlContext.sql _ /** * A helper object for importing SQL implicits. @@ -62,7 +63,14 @@ private[sql] trait SQLTestUtils * but the implicits import is needed in the constructor. */ protected object testImplicits extends SQLImplicits { - protected override def _sqlContext: SQLContext = self._sqlContext + protected override def _sqlContext: SQLContext = self.sqlContext + + // This must live here to preserve binary compatibility with Spark < 1.5. + implicit class StringToColumn(val sc: StringContext) { + def $(args: Any*): ColumnName = { + new ColumnName(sc.s(args: _*)) + } + } } /** @@ -83,8 +91,8 @@ private[sql] trait SQLTestUtils /** * The Hadoop configuration used by the active [[SQLContext]]. */ - protected def configuration: Configuration = { - _sqlContext.sparkContext.hadoopConfiguration + protected def hadoopConfiguration: Configuration = { + sparkContext.hadoopConfiguration } /** @@ -95,12 +103,12 @@ private[sql] trait SQLTestUtils */ protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { val (keys, values) = pairs.unzip - val currentValues = keys.map(key => Try(_sqlContext.conf.getConfString(key)).toOption) - (keys, values).zipped.foreach(_sqlContext.conf.setConfString) + val currentValues = keys.map(key => Try(sqlContext.conf.getConfString(key)).toOption) + (keys, values).zipped.foreach(sqlContext.conf.setConfString) try f finally { keys.zip(currentValues).foreach { - case (key, Some(value)) => _sqlContext.conf.setConfString(key, value) - case (key, None) => _sqlContext.conf.unsetConf(key) + case (key, Some(value)) => sqlContext.conf.setConfString(key, value) + case (key, None) => sqlContext.conf.unsetConf(key) } } } @@ -132,7 +140,7 @@ private[sql] trait SQLTestUtils * Drops temporary table `tableName` after calling `f`. 
*/ protected def withTempTable(tableNames: String*)(f: => Unit): Unit = { - try f finally tableNames.foreach(_sqlContext.dropTempTable) + try f finally tableNames.foreach(sqlContext.dropTempTable) } /** @@ -141,7 +149,7 @@ private[sql] trait SQLTestUtils protected def withTable(tableNames: String*)(f: => Unit): Unit = { try f finally { tableNames.foreach { name => - _sqlContext.sql(s"DROP TABLE IF EXISTS $name") + sqlContext.sql(s"DROP TABLE IF EXISTS $name") } } } @@ -154,12 +162,12 @@ private[sql] trait SQLTestUtils val dbName = s"db_${UUID.randomUUID().toString.replace('-', '_')}" try { - _sqlContext.sql(s"CREATE DATABASE $dbName") + sqlContext.sql(s"CREATE DATABASE $dbName") } catch { case cause: Throwable => fail("Failed to create temporary database", cause) } - try f(dbName) finally _sqlContext.sql(s"DROP DATABASE $dbName CASCADE") + try f(dbName) finally sqlContext.sql(s"DROP DATABASE $dbName CASCADE") } /** @@ -167,8 +175,8 @@ private[sql] trait SQLTestUtils * `f` returns. */ protected def activateDatabase(db: String)(f: => Unit): Unit = { - _sqlContext.sql(s"USE $db") - try f finally _sqlContext.sql(s"USE default") + sqlContext.sql(s"USE $db") + try f finally sqlContext.sql(s"USE default") } /** @@ -176,6 +184,49 @@ private[sql] trait SQLTestUtils * way to construct [[DataFrame]] directly out of local data without relying on implicits. */ protected implicit def logicalPlanToSparkQuery(plan: LogicalPlan): DataFrame = { - DataFrame(_sqlContext, plan) + DataFrame(sqlContext, plan) + } +} + +private[sql] object SQLTestUtils { + + def compareAnswers( + sparkAnswer: Seq[Row], + expectedAnswer: Seq[Row], + sort: Boolean): Option[String] = { + def prepareAnswer(answer: Seq[Row]): Seq[Row] = { + // Converts data to types that we can do equality comparison using Scala collections. + // For BigDecimal type, the Scala type has a better definition of equality test (similar to + // Java's java.math.BigDecimal.compareTo). + // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for + // equality test. + // This function is copied from Catalyst's QueryTest + val converted: Seq[Row] = answer.map { s => + Row.fromSeq(s.toSeq.map { + case d: java.math.BigDecimal => BigDecimal(d) + case b: Array[Byte] => b.toSeq + case o => o + }) + } + if (sort) { + converted.sortBy(_.toString()) + } else { + converted + } + } + if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) { + val errorMessage = + s""" + | == Results == + | ${sideBySide( + s"== Expected Answer - ${expectedAnswer.size} ==" +: + prepareAnswer(expectedAnswer).map(_.toString()), + s"== Actual Answer - ${sparkAnswer.size} ==" +: + prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n")} + """.stripMargin + Some(errorMessage) + } else { + None + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index 8a061b6bc690d..963d10eed62ed 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -17,13 +17,13 @@ package org.apache.spark.sql.test -import org.apache.spark.sql.{ColumnName, SQLContext} +import org.apache.spark.sql.SQLContext /** * Helper trait for SQL test suites where all tests share a single [[TestSQLContext]]. 
*/ -private[sql] trait SharedSQLContext extends SQLTestUtils { +trait SharedSQLContext extends SQLTestUtils { /** * The [[TestSQLContext]] to use for all tests in this suite. @@ -36,9 +36,7 @@ private[sql] trait SharedSQLContext extends SQLTestUtils { /** * The [[TestSQLContext]] to use for all tests in this suite. */ - protected def ctx: TestSQLContext = _ctx - protected def sqlContext: TestSQLContext = _ctx - protected override def _sqlContext: SQLContext = _ctx + protected def sqlContext: SQLContext = _ctx /** * Initialize the [[TestSQLContext]]. @@ -64,15 +62,4 @@ private[sql] trait SharedSQLContext extends SQLTestUtils { super.afterAll() } } - - /** - * Converts $"col name" into an [[Column]]. - * @since 1.3.0 - */ - // This must be duplicated here to preserve binary compatibility with Spark < 1.5. - implicit class StringToColumn(val sc: StringContext) { - def $(args: Any*): ColumnName = { - new ColumnName(sc.s(args: _*)) - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala index 92ef2f7d74ba1..10e633f3cde46 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -31,13 +31,24 @@ private[sql] class TestSQLContext(sc: SparkContext) extends SQLContext(sc) { sel new SparkConf().set("spark.sql.testkey", "true"))) } - // Use fewer partitions to speed up testing + // Make sure we set those test specific confs correctly when we create + // the SQLConf as well as when we call clear. protected[sql] override def createSession(): SQLSession = new this.SQLSession() /** A special [[SQLSession]] that uses fewer shuffle partitions than normal. */ protected[sql] class SQLSession extends super.SQLSession { protected[sql] override lazy val conf: SQLConf = new SQLConf { - override def numShufflePartitions: Int = this.getConf(SQLConf.SHUFFLE_PARTITIONS, 5) + + clear() + + override def clear(): Unit = { + super.clear() + + // Make sure we start with the default test configs even after clear + TestSQLContext.overrideConfs.map { + case (key, value) => setConfString(key, value) + } + } } } @@ -47,6 +58,17 @@ private[sql] class TestSQLContext(sc: SparkContext) extends SQLContext(sc) { sel } private object testData extends SQLTestData { - protected override def _sqlContext: SQLContext = self + protected override def sqlContext: SQLContext = self } } + +private[sql] object TestSQLContext { + + /** + * A map used to store all confs that need to be overridden in sql/core unit tests. + */ + val overrideConfs: Map[String, String] = + Map( + // Fewer shuffle partitions to speed up testing. 
+ SQLConf.SHUFFLE_PARTITIONS.key -> "5") +} diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 3566c87dd248c..f7fe085f34d84 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index dd9fef9206d0b..a4fd0c3ce9702 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -55,7 +55,6 @@ object HiveThriftServer2 extends Logging { @DeveloperApi def startWithContext(sqlContext: HiveContext): Unit = { val server = new HiveThriftServer2(sqlContext) - sqlContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) server.init(sqlContext.hiveconf) server.start() listener = new HiveThriftServer2Listener(server, sqlContext.conf) @@ -93,6 +92,12 @@ } else { None } + // If the application was killed before HiveThriftServer2 started successfully, the SparkSubmit + // process cannot exit, so check whether the SparkContext was stopped. + if (SparkSQLEnv.sparkContext.stopped.get()) { + logError("SparkContext has stopped even though HiveServer2 has started, so exiting") + System.exit(-1) + } } catch { case e: Exception => logError("Error starting HiveThriftServer2", e) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index a29df567983b1..b5073961a1c84 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -171,6 +171,7 @@ private[hive] object SparkSQLCLIDriver extends Logging { val reader = new ConsoleReader() reader.setBellEnabled(false) + reader.setExpandEvents(false) // reader.setDebug(new PrintWriter(new FileWriter("writer.debug", true))) CliDriver.getCommandCompleter.foreach((e) => reader.addCompleter(e)) diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index b72249b3bf8c0..19b2f24456ab0 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -21,6 +21,7 @@ import java.io.File import java.net.URL import java.sql.{Date, DriverManager, SQLException, Statement} +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.duration._ @@ -431,6 +432,32 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { } ) } + + test("Checks Hive version via SET -v") { + withJdbcStatement { statement => + val resultSet = statement.executeQuery("SET -v") + + val conf = mutable.Map.empty[String, String] + while (resultSet.next()) { + conf += resultSet.getString(1) -> 
resultSet.getString(2) + } + + assert(conf.get("spark.sql.hive.version") === Some("1.2.1")) + } + } + + test("Checks Hive version via SET") { + withJdbcStatement { statement => + val resultSet = statement.executeQuery("SET") + + val conf = mutable.Map.empty[String, String] + while (resultSet.next()) { + conf += resultSet.getString(1) -> resultSet.getString(2) + } + + assert(conf.get("spark.sql.hive.version") === Some("1.2.1")) + } + } } class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala deleted file mode 100644 index 1a5ba20404c4e..0000000000000 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive.execution - -import java.io.File - -import org.apache.spark.sql.SQLConf -import org.apache.spark.sql.hive.test.TestHive - -/** - * Runs the test cases that are included in the hive distribution with hash joins. 
- */ -class HashJoinCompatibilitySuite extends HiveCompatibilitySuite { - override def beforeAll() { - super.beforeAll() - TestHive.setConf(SQLConf.SORTMERGE_JOIN, false) - } - - override def afterAll() { - TestHive.setConf(SQLConf.SORTMERGE_JOIN, true) - super.afterAll() - } - - override def whiteList = Seq( - "auto_join0", - "auto_join1", - "auto_join10", - "auto_join11", - "auto_join12", - "auto_join13", - "auto_join14", - "auto_join14_hadoop20", - "auto_join15", - "auto_join17", - "auto_join18", - "auto_join19", - "auto_join2", - "auto_join20", - "auto_join21", - "auto_join22", - "auto_join23", - "auto_join24", - "auto_join25", - "auto_join26", - "auto_join27", - "auto_join28", - "auto_join3", - "auto_join30", - "auto_join31", - "auto_join32", - "auto_join4", - "auto_join5", - "auto_join6", - "auto_join7", - "auto_join8", - "auto_join9", - "auto_join_filters", - "auto_join_nulls", - "auto_join_reordering_values", - "auto_smb_mapjoin_14", - "auto_sortmerge_join_1", - "auto_sortmerge_join_10", - "auto_sortmerge_join_11", - "auto_sortmerge_join_12", - "auto_sortmerge_join_13", - "auto_sortmerge_join_14", - "auto_sortmerge_join_15", - "auto_sortmerge_join_16", - "auto_sortmerge_join_2", - "auto_sortmerge_join_3", - "auto_sortmerge_join_4", - "auto_sortmerge_join_5", - "auto_sortmerge_join_6", - "auto_sortmerge_join_7", - "auto_sortmerge_join_8", - "auto_sortmerge_join_9", - "correlationoptimizer1", - "correlationoptimizer10", - "correlationoptimizer11", - "correlationoptimizer13", - "correlationoptimizer14", - "correlationoptimizer15", - "correlationoptimizer2", - "correlationoptimizer3", - "correlationoptimizer4", - "correlationoptimizer6", - "correlationoptimizer7", - "correlationoptimizer8", - "correlationoptimizer9", - "join0", - "join1", - "join10", - "join11", - "join12", - "join13", - "join14", - "join14_hadoop20", - "join15", - "join16", - "join17", - "join18", - "join19", - "join2", - "join20", - "join21", - "join22", - "join23", - "join24", - "join25", - "join26", - "join27", - "join28", - "join29", - "join3", - "join30", - "join31", - "join32", - "join32_lessSize", - "join33", - "join34", - "join35", - "join36", - "join37", - "join38", - "join39", - "join4", - "join40", - "join41", - "join5", - "join6", - "join7", - "join8", - "join9", - "join_1to1", - "join_array", - "join_casesensitive", - "join_empty", - "join_filters", - "join_hive_626", - "join_map_ppr", - "join_nulls", - "join_nullsafe", - "join_rc", - "join_reorder2", - "join_reorder3", - "join_reorder4", - "join_star" - ) - - // Only run those query tests in the realWhileList (do not try other ignored query files). - override def testCases: Seq[(String, File)] = super.testCases.filter { - case (name, _) => realWhiteList.contains(name) - } -} diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index be1607476e254..ac67fe5f47be9 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 2e791cea96b41..17de8ef56f9a6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -111,8 +111,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * this does not necessarily need to be the same version of Hive that is used internally by * Spark SQL for execution. 
*/ - protected[hive] def hiveMetastoreVersion: String = - getConf(HIVE_METASTORE_VERSION, hiveExecutionVersion) + protected[hive] def hiveMetastoreVersion: String = getConf(HIVE_METASTORE_VERSION) /** * The location of the jars that should be used to instantiate the HiveMetastoreClient. This @@ -202,7 +201,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { "Builtin jars can only be used when hive execution version == hive metastore version. " + s"Execution: ${hiveExecutionVersion} != Metastore: ${hiveMetastoreVersion}. " + "Specify a vaild path to the correct hive jars using $HIVE_METASTORE_JARS " + - s"or change $HIVE_METASTORE_VERSION to $hiveExecutionVersion.") + s"or change ${HIVE_METASTORE_VERSION.key} to $hiveExecutionVersion.") } // We recursively find all jars in the class loader chain, @@ -292,12 +291,12 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * @since 1.3.0 */ def refreshTable(tableName: String): Unit = { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val tableIdent = SqlParser.parseTableIdentifier(tableName) catalog.refreshTable(tableIdent) } protected[hive] def invalidateTable(tableName: String): Unit = { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val tableIdent = SqlParser.parseTableIdentifier(tableName) catalog.invalidateTable(tableIdent) } @@ -312,7 +311,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { */ @Experimental def analyze(tableName: String) { - val tableIdent = new SqlParser().parseTableIdentifier(tableName) + val tableIdent = SqlParser.parseTableIdentifier(tableName) val relation = EliminateSubQueries(catalog.lookupRelation(tableIdent.toSeq)) relation match { @@ -564,7 +563,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { /** Extends QueryExecution with hive specific features. */ protected[sql] class QueryExecution(logicalPlan: LogicalPlan) - extends super.QueryExecution(logicalPlan) { + extends org.apache.spark.sql.execution.QueryExecution(this, logicalPlan) { /** * Returns the result as a hive compatible sequence of strings. For native commands, the @@ -582,10 +581,10 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { .mkString("\t") } case command: ExecutedCommand => - command.executeCollect().map(_(0).toString) + command.executeCollect().map(_.getString(0)) case other => - val result: Seq[Seq[Any]] = other.executeCollect().map(_.toSeq).toSeq + val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq // We need the types so we can output struct field names val types = analyzed.output.map(_.dataType) // Reformat to match hive tab delimited output. @@ -606,7 +605,16 @@ private[hive] object HiveContext { /** The version of hive used internally by Spark SQL. */ val hiveExecutionVersion: String = "1.2.1" - val HIVE_METASTORE_VERSION: String = "spark.sql.hive.metastore.version" + val HIVE_METASTORE_VERSION = stringConf("spark.sql.hive.metastore.version", + defaultValue = Some(hiveExecutionVersion), + doc = "Version of the Hive metastore. 
Available options are " + + s"0.12.0 through $hiveExecutionVersion.") + + val HIVE_EXECUTION_VERSION = stringConf( + key = "spark.sql.hive.version", + defaultValue = Some(hiveExecutionVersion), + doc = "Version of Hive used internally by Spark SQL.") + val HIVE_METASTORE_JARS = stringConf("spark.sql.hive.metastore.jars", defaultValue = Some("builtin"), doc = s""" diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index b8da0840ae569..ea1521a48c8a7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -199,7 +199,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive options: Map[String, String], isExternal: Boolean): Unit = { createDataSourceTable( - new SqlParser().parseTableIdentifier(tableName), + SqlParser.parseTableIdentifier(tableName), userSpecifiedSchema, partitionColumns, provider, @@ -309,73 +309,72 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive } // TODO: Support persisting partitioned data source relations in Hive compatible format - val hiveTable = (maybeSerDe, dataSource.relation) match { + val qualifiedTableName = tableIdent.quotedString + val (hiveCompitiableTable, logMessage) = (maybeSerDe, dataSource.relation) match { case (Some(serde), relation: HadoopFsRelation) - if relation.paths.length == 1 && relation.partitionColumns.isEmpty => - // Hive ParquetSerDe doesn't support decimal type until 1.2.0. - val isParquetSerDe = serde.inputFormat.exists(_.toLowerCase.contains("parquet")) - val hasDecimalFields = relation.schema.existsRecursively(_.isInstanceOf[DecimalType]) - - val hiveParquetSupportsDecimal = client.version match { - case org.apache.spark.sql.hive.client.hive.v1_2 => true - case _ => false - } - - if (isParquetSerDe && !hiveParquetSupportsDecimal && hasDecimalFields) { - // If Hive version is below 1.2.0, we cannot save Hive compatible schema to - // metastore when the file format is Parquet and the schema has DecimalType. - logWarning { - "Persisting Parquet relation with decimal field(s) into Hive metastore in Spark SQL " + - "specific format, which is NOT compatible with Hive. Because ParquetHiveSerDe in " + - s"Hive ${client.version.fullVersion} doesn't support decimal type. See HIVE-6384." - } - newSparkSQLSpecificMetastoreTable() - } else { - logInfo { - "Persisting data source relation with a single input path into Hive metastore in " + - s"Hive compatible format. Input path: ${relation.paths.head}" - } - newHiveCompatibleMetastoreTable(relation, serde) - } + if relation.paths.length == 1 && relation.partitionColumns.isEmpty => + val hiveTable = newHiveCompatibleMetastoreTable(relation, serde) + val message = + s"Persisting data source relation $qualifiedTableName with a single input path " + + s"into Hive metastore in Hive compatible format. Input path: ${relation.paths.head}." + (Some(hiveTable), message) case (Some(serde), relation: HadoopFsRelation) if relation.partitionColumns.nonEmpty => - logWarning { - "Persisting partitioned data source relation into Hive metastore in " + - s"Spark SQL specific format, which is NOT compatible with Hive. 
Input path(s): " + - relation.paths.mkString("\n", "\n", "") - } - newSparkSQLSpecificMetastoreTable() + val message = + s"Persisting partitioned data source relation $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + + "Input path(s): " + relation.paths.mkString("\n", "\n", "") + (None, message) case (Some(serde), relation: HadoopFsRelation) => - logWarning { - "Persisting data source relation with multiple input paths into Hive metastore in " + - s"Spark SQL specific format, which is NOT compatible with Hive. Input paths: " + - relation.paths.mkString("\n", "\n", "") - } - newSparkSQLSpecificMetastoreTable() + val message = + s"Persisting data source relation $qualifiedTableName with multiple input paths into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + + s"Input paths: " + relation.paths.mkString("\n", "\n", "") + (None, message) case (Some(serde), _) => - logWarning { - s"Data source relation is not a ${classOf[HadoopFsRelation].getSimpleName}. " + - "Persisting it into Hive metastore in Spark SQL specific format, " + - "which is NOT compatible with Hive." - } - newSparkSQLSpecificMetastoreTable() + val message = + s"Data source relation $qualifiedTableName is not a " + + s"${classOf[HadoopFsRelation].getSimpleName}. Persisting it into Hive metastore " + + "in Spark SQL specific format, which is NOT compatible with Hive." + (None, message) case _ => - logWarning { + val message = s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + - "Persisting data source relation into Hive metastore in Spark SQL specific format, " + - "which is NOT compatible with Hive." - } - newSparkSQLSpecificMetastoreTable() + s"Persisting data source relation $qualifiedTableName into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive." + (None, message) } - client.createTable(hiveTable) + (hiveCompitiableTable, logMessage) match { + case (Some(table), message) => + // We first try to save the metadata of the table in a Hive compatiable way. + // If Hive throws an error, we fall back to save its metadata in the Spark SQL + // specific way. + try { + logInfo(message) + client.createTable(table) + } catch { + case throwable: Throwable => + val warningMessage = + s"Could not persist $qualifiedTableName in a Hive compatible way. Persisting " + + s"it into Hive metastore in Spark SQL specific format." 
+ logWarning(warningMessage, throwable) + val sparkSqlSpecificTable = newSparkSQLSpecificMetastoreTable() + client.createTable(sparkSqlSpecificTable) + } + + case (None, message) => + logWarning(message) + val hiveTable = newSparkSQLSpecificMetastoreTable() + client.createTable(hiveTable) + } } def hiveDefaultTableFilePath(tableName: String): String = { - hiveDefaultTableFilePath(new SqlParser().parseTableIdentifier(tableName)) + hiveDefaultTableFilePath(SqlParser.parseTableIdentifier(tableName)) } def hiveDefaultTableFilePath(tableIdent: TableIdentifier): String = { @@ -449,7 +448,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = { cachedDataSourceTables.getIfPresent(tableIdentifier) match { case null => None // Cache miss - case logical @ LogicalRelation(parquetRelation: ParquetRelation) => + case logical @ LogicalRelation(parquetRelation: ParquetRelation, _) => // If we have the same paths, same schema, and same partition spec, // we will use the cached Parquet Relation. val useCached = @@ -515,7 +514,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive parquetRelation } - result.newInstance() + result.copy(expectedOutputAttributes = Some(metastoreRelation.output)) } override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { @@ -554,60 +553,28 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive return plan } - // Collects all `MetastoreRelation`s which should be replaced - val toBeReplaced = plan.collect { + plan transformUp { // Write path - case InsertIntoTable(relation: MetastoreRelation, _, _, _, _) - // Inserting into partitioned table is not supported in Parquet data source (yet). - if !relation.hiveQlTable.isPartitioned && - hive.convertMetastoreParquet && - relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => - val parquetRelation = convertToParquetRelation(relation) - val attributedRewrites = relation.output.zip(parquetRelation.output) - (relation, parquetRelation, attributedRewrites) + case InsertIntoTable(r: MetastoreRelation, partition, child, overwrite, ifNotExists) + // Inserting into partitioned table is not supported in Parquet data source (yet). + if !r.hiveQlTable.isPartitioned && hive.convertMetastoreParquet && + r.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + val parquetRelation = convertToParquetRelation(r) + InsertIntoTable(parquetRelation, partition, child, overwrite, ifNotExists) // Write path - case InsertIntoHiveTable(relation: MetastoreRelation, _, _, _, _) + case InsertIntoHiveTable(r: MetastoreRelation, partition, child, overwrite, ifNotExists) // Inserting into partitioned table is not supported in Parquet data source (yet). 
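Stepping back from the createDataSourceTable rewrite earlier in this hunk: per its log messages, only a relation with a single input path and no partition columns is offered to the metastore in Hive compatible form, and even then the code falls back to the Spark SQL specific layout if Hive rejects the table. A hedged usage sketch (the table names, the day column, and df are placeholders):

    // Single input path, no partition columns: Hive compatible metadata is attempted first,
    // with a fallback to the Spark SQL specific format if the metastore throws.
    df.write.format("parquet").saveAsTable("events")

    // Partitioned relations are always stored in the Spark SQL specific format.
    df.write.format("parquet").partitionBy("day").saveAsTable("events_by_day")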
- if !relation.hiveQlTable.isPartitioned && - hive.convertMetastoreParquet && - relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => - val parquetRelation = convertToParquetRelation(relation) - val attributedRewrites = relation.output.zip(parquetRelation.output) - (relation, parquetRelation, attributedRewrites) + if !r.hiveQlTable.isPartitioned && hive.convertMetastoreParquet && + r.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + val parquetRelation = convertToParquetRelation(r) + InsertIntoTable(parquetRelation, partition, child, overwrite, ifNotExists) // Read path case relation: MetastoreRelation if hive.convertMetastoreParquet && - relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => val parquetRelation = convertToParquetRelation(relation) - val attributedRewrites = relation.output.zip(parquetRelation.output) - (relation, parquetRelation, attributedRewrites) - } - - val relationMap = toBeReplaced.map(r => (r._1, r._2)).toMap - val attributedRewrites = AttributeMap(toBeReplaced.map(_._3).fold(Nil)(_ ++: _)) - - // Replaces all `MetastoreRelation`s with corresponding `ParquetRelation2`s, and fixes - // attribute IDs referenced in other nodes. - plan.transformUp { - case r: MetastoreRelation if relationMap.contains(r) => - val parquetRelation = relationMap(r) - val alias = r.alias.getOrElse(r.tableName) - Subquery(alias, parquetRelation) - - case InsertIntoTable(r: MetastoreRelation, partition, child, overwrite, ifNotExists) - if relationMap.contains(r) => - val parquetRelation = relationMap(r) - InsertIntoTable(parquetRelation, partition, child, overwrite, ifNotExists) - - case InsertIntoHiveTable(r: MetastoreRelation, partition, child, overwrite, ifNotExists) - if relationMap.contains(r) => - val parquetRelation = relationMap(r) - InsertIntoTable(parquetRelation, partition, child, overwrite, ifNotExists) - - case other => other.transformExpressions { - case a: Attribute if a.resolved => attributedRewrites.getOrElse(a, a) - } + Subquery(relation.alias.getOrElse(relation.tableName), parquetRelation) } } } @@ -767,7 +734,7 @@ private[hive] case class InsertIntoHiveTable( private[hive] case class MetastoreRelation (databaseName: String, tableName: String, alias: Option[String]) (val table: HiveTable) - (@transient sqlContext: SQLContext) + (@transient private val sqlContext: SQLContext) extends LeafNode with MultiInstanceRelation with FileRelation { override def equals(other: Any): Boolean = other match { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index d5cd7e98b5267..256440a9a2e97 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.ql.lib.Node import org.apache.hadoop.hive.ql.parse._ import org.apache.hadoop.hive.ql.plan.PlanUtils import org.apache.hadoop.hive.ql.session.SessionState +import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.spark.Logging import org.apache.spark.sql.AnalysisException @@ -884,16 +885,22 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C AttributeReference("value", StringType)()), true) } - def matchSerDe(clause: Seq[ASTNode]) - : (Seq[(String, String)], Option[String], Seq[(String, String)]) = clause match { + type SerDeInfo = ( + Seq[(String, 
String)], // Input row format information + Option[String], // Optional input SerDe class + Seq[(String, String)], // Input SerDe properties + Boolean // Whether to use default record reader/writer + ) + + def matchSerDe(clause: Seq[ASTNode]): SerDeInfo = clause match { case Token("TOK_SERDEPROPS", propsClause) :: Nil => val rowFormat = propsClause.map { case Token(name, Token(value, Nil) :: Nil) => (name, value) } - (rowFormat, None, Nil) + (rowFormat, None, Nil, false) case Token("TOK_SERDENAME", Token(serdeClass, Nil) :: Nil) :: Nil => - (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), Nil) + (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), Nil, false) case Token("TOK_SERDENAME", Token(serdeClass, Nil) :: Token("TOK_TABLEPROPERTIES", @@ -903,20 +910,47 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C (BaseSemanticAnalyzer.unescapeSQLString(name), BaseSemanticAnalyzer.unescapeSQLString(value)) } - (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), serdeProps) - case Nil => (Nil, Option(hiveConf.getVar(ConfVars.HIVESCRIPTSERDE)), Nil) + // SPARK-10310: Special cases LazySimpleSerDe + // TODO Fully supports user-defined record reader/writer classes + val unescapedSerDeClass = BaseSemanticAnalyzer.unescapeSQLString(serdeClass) + val useDefaultRecordReaderWriter = + unescapedSerDeClass == classOf[LazySimpleSerDe].getCanonicalName + (Nil, Some(unescapedSerDeClass), serdeProps, useDefaultRecordReaderWriter) + + case Nil => + // Uses default TextRecordReader/TextRecordWriter, sets field delimiter here + val serdeProps = Seq(serdeConstants.FIELD_DELIM -> "\t") + (Nil, Option(hiveConf.getVar(ConfVars.HIVESCRIPTSERDE)), serdeProps, true) } - val (inRowFormat, inSerdeClass, inSerdeProps) = matchSerDe(inputSerdeClause) - val (outRowFormat, outSerdeClass, outSerdeProps) = matchSerDe(outputSerdeClause) + val (inRowFormat, inSerdeClass, inSerdeProps, useDefaultRecordReader) = + matchSerDe(inputSerdeClause) + + val (outRowFormat, outSerdeClass, outSerdeProps, useDefaultRecordWriter) = + matchSerDe(outputSerdeClause) val unescapedScript = BaseSemanticAnalyzer.unescapeSQLString(script) + // TODO Adds support for user-defined record reader/writer classes + val recordReaderClass = if (useDefaultRecordReader) { + Option(hiveConf.getVar(ConfVars.HIVESCRIPTRECORDREADER)) + } else { + None + } + + val recordWriterClass = if (useDefaultRecordWriter) { + Option(hiveConf.getVar(ConfVars.HIVESCRIPTRECORDWRITER)) + } else { + None + } + val schema = HiveScriptIOSchema( inRowFormat, outRowFormat, inSerdeClass, outSerdeClass, - inSerdeProps, outSerdeProps, schemaLess) + inSerdeProps, outSerdeProps, + recordReaderClass, recordWriterClass, + schemaLess) Some( logical.ScriptTransformation( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index dc355690852bd..e35468a624c3e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -54,10 +54,10 @@ private[hive] sealed trait TableReader { */ private[hive] class HadoopTableReader( - @transient attributes: Seq[Attribute], - @transient relation: MetastoreRelation, - @transient sc: HiveContext, - @transient hiveExtraConf: HiveConf) + @transient private val attributes: Seq[Attribute], + @transient private val relation: MetastoreRelation, + @transient private val sc: HiveContext, + hiveExtraConf: 
HiveConf)
   extends TableReader with Logging {
   // Hadoop honors "mapred.map.tasks" as hint, but will ignore when mapred.job.tracker is "local".
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index 7856037508412..1fe4cba9571f3 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -131,6 +131,7 @@ private[hive] class IsolatedClientLoader(
     name.contains("slf4j") ||
     name.contains("log4j") ||
     name.startsWith("org.apache.spark.") ||
+    (name.startsWith("org.apache.hadoop.") && !name.startsWith("org.apache.hadoop.hive.")) ||
     name.startsWith("scala.") ||
     (name.startsWith("com.google") && !name.startsWith("com.google.cloud")) ||
     name.startsWith("java.lang.") ||
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index 58f7fa640e8a9..f936cf565b2bc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -28,7 +28,7 @@ import org.apache.hadoop.hive.ql.{Context, ErrorMsg}
 import org.apache.hadoop.hive.serde2.Serializer
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption
 import org.apache.hadoop.hive.serde2.objectinspector._
-import org.apache.hadoop.mapred.{FileOutputFormat, JobConf}
+import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Row
@@ -62,7 +62,7 @@ case class InsertIntoHiveTable(
   def output: Seq[Attribute] = Seq.empty
-  def saveAsHiveFile(
+  private def saveAsHiveFile(
       rdd: RDD[InternalRow],
       valueClass: Class[_],
       fileSinkConf: FileSinkDesc,
@@ -124,7 +124,7 @@ case class InsertIntoHiveTable(
    *
    * Note: this is run once and then kept to avoid double insertions.
    */
-  protected[sql] lazy val sideEffectResult: Seq[Row] = {
+  protected[sql] lazy val sideEffectResult: Seq[InternalRow] = {
     // Have to pass the TableDesc object to RDD.mapPartitions and then instantiate new serializer
     // instances within the closure, since Serializer is not serializable while TableDesc is.
     val tableDesc = table.tableDesc
@@ -178,6 +178,19 @@ case class InsertIntoHiveTable(
     val jobConf = new JobConf(sc.hiveconf)
     val jobConfSer = new SerializableJobConf(jobConf)
+    // When speculation is on and output committer class name contains "Direct", we should warn
+    // users that they may lose data if they are using a direct output committer.
+    val speculationEnabled = sqlContext.sparkContext.conf.getBoolean("spark.speculation", false)
+    val outputCommitterClass = jobConf.get("mapred.output.committer.class", "")
+    if (speculationEnabled && outputCommitterClass.contains("Direct")) {
+      val warningMessage =
+        s"$outputCommitterClass may be an output committer that writes data directly to " +
+          "the final location. Because speculation is enabled, this output committer may " +
+          "cause data loss (see the case in SPARK-10063). If possible, please use an output " +
+          "committer that does not have this behavior (e.g. FileOutputCommitter)."
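// Illustration only, not part of this patch: the check above only inspects two settings and, as
// the following line shows, it produces a warning rather than a failure. The configurations it
// nudges users toward look roughly like this; `sparkConf` and `jobConf` are assumed to be the
// application's SparkConf and the JobConf built for the write.
sparkConf.set("spark.speculation", "false")
// ...or keep a committer that goes through the normal task-commit protocol, for example:
jobConf.set("mapred.output.committer.class",
  classOf[org.apache.hadoop.mapred.FileOutputCommitter].getName)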
+ logWarning(warningMessage) + } + val writerContainer = if (numDynamicPartitions > 0) { val dynamicPartColNames = partitionColumnNames.takeRight(numDynamicPartitions) new SparkHiveDynamicPartitionWriterContainer(jobConf, fileSinkConf, dynamicPartColNames) @@ -254,10 +267,10 @@ case class InsertIntoHiveTable( // however for now we return an empty list to simplify compatibility checks with hive, which // does not return anything for insert operations. // TODO: implement hive compatibility as rules. - Seq.empty[Row] + Seq.empty[InternalRow] } - override def executeCollect(): Array[Row] = sideEffectResult.toArray + override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray protected override def doExecute(): RDD[InternalRow] = { sqlContext.sparkContext.parallelize(sideEffectResult.asInstanceOf[Seq[InternalRow]], 1) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala index c7651daffe36e..b30117f0de997 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala @@ -24,20 +24,22 @@ import javax.annotation.Nullable import scala.collection.JavaConverters._ import scala.util.control.NonFatal +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.hive.ql.exec.{RecordReader, RecordWriter} import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.AbstractSerDe import org.apache.hadoop.hive.serde2.objectinspector._ import org.apache.hadoop.io.Writable import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.execution._ import org.apache.spark.sql.hive.HiveShim._ import org.apache.spark.sql.hive.{HiveContext, HiveInspectors} import org.apache.spark.sql.types.DataType -import org.apache.spark.util.{CircularBuffer, RedirectThread, Utils} +import org.apache.spark.util.{CircularBuffer, RedirectThread, SerializableConfiguration, Utils} import org.apache.spark.{Logging, TaskContext} /** @@ -53,11 +55,13 @@ case class ScriptTransformation( script: String, output: Seq[Attribute], child: SparkPlan, - ioschema: HiveScriptIOSchema)(@transient sc: HiveContext) + ioschema: HiveScriptIOSchema)(@transient private val sc: HiveContext) extends UnaryNode { override def otherCopyArgs: Seq[HiveContext] = sc :: Nil + private val serializedHiveConf = new SerializableConfiguration(sc.hiveconf) + protected override def doExecute(): RDD[InternalRow] = { def processIterator(inputIterator: Iterator[InternalRow]): Iterator[InternalRow] = { val cmd = List("/bin/bash", "-c", script) @@ -67,6 +71,7 @@ case class ScriptTransformation( val inputStream = proc.getInputStream val outputStream = proc.getOutputStream val errorStream = proc.getErrorStream + val localHiveConf = serializedHiveConf.value // In order to avoid deadlocks, we need to consume the error output of the child process. 
// To avoid issues caused by large error output, we use a circular buffer to limit the amount @@ -96,7 +101,8 @@ case class ScriptTransformation( outputStream, proc, stderrBuffer, - TaskContext.get() + TaskContext.get(), + localHiveConf ) // This nullability is a performance optimization in order to avoid an Option.foreach() call @@ -109,6 +115,10 @@ case class ScriptTransformation( val outputIterator: Iterator[InternalRow] = new Iterator[InternalRow] with HiveInspectors { var curLine: String = null val scriptOutputStream = new DataInputStream(inputStream) + + @Nullable val scriptOutputReader = + ioschema.recordReader(scriptOutputStream, localHiveConf).orNull + var scriptOutputWritable: Writable = null val reusedWritableObject: Writable = if (null != outputSerde) { outputSerde.getSerializedClass().newInstance @@ -134,15 +144,25 @@ case class ScriptTransformation( } } else if (scriptOutputWritable == null) { scriptOutputWritable = reusedWritableObject - try { - scriptOutputWritable.readFields(scriptOutputStream) - true - } catch { - case _: EOFException => - if (writerThread.exception.isDefined) { - throw writerThread.exception.get - } + + if (scriptOutputReader != null) { + if (scriptOutputReader.next(scriptOutputWritable) <= 0) { + writerThread.exception.foreach(throw _) false + } else { + true + } + } else { + try { + scriptOutputWritable.readFields(scriptOutputStream) + true + } catch { + case _: EOFException => + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + false + } } } else { true @@ -210,7 +230,8 @@ private class ScriptTransformationWriterThread( outputStream: OutputStream, proc: Process, stderrBuffer: CircularBuffer, - taskContext: TaskContext + taskContext: TaskContext, + conf: Configuration ) extends Thread("Thread-ScriptTransformation-Feed") with Logging { setDaemon(true) @@ -224,6 +245,7 @@ private class ScriptTransformationWriterThread( TaskContext.setTaskContext(taskContext) val dataOutputStream = new DataOutputStream(outputStream) + @Nullable val scriptInputWriter = ioschema.recordWriter(dataOutputStream, conf).orNull // We can't use Utils.tryWithSafeFinally here because we also need a `catch` block, so // let's use a variable to record whether the `finally` block was hit due to an exception @@ -250,7 +272,12 @@ private class ScriptTransformationWriterThread( } else { val writable = inputSerde.serialize( row.asInstanceOf[GenericInternalRow].values, inputSoi) - prepareWritable(writable, ioschema.outputSerdeProps).write(dataOutputStream) + + if (scriptInputWriter != null) { + scriptInputWriter.write(writable) + } else { + prepareWritable(writable, ioschema.outputSerdeProps).write(dataOutputStream) + } } } outputStream.close() @@ -290,6 +317,8 @@ case class HiveScriptIOSchema ( outputSerdeClass: Option[String], inputSerdeProps: Seq[(String, String)], outputSerdeProps: Seq[(String, String)], + recordReaderClass: Option[String], + recordWriterClass: Option[String], schemaLess: Boolean) extends ScriptInputOutputSchema with HiveInspectors { private val defaultFormat = Map( @@ -347,4 +376,24 @@ case class HiveScriptIOSchema ( serde } + + def recordReader( + inputStream: InputStream, + conf: Configuration): Option[RecordReader] = { + recordReaderClass.map { klass => + val instance = Utils.classForName(klass).newInstance().asInstanceOf[RecordReader] + val props = new Properties() + props.putAll(outputSerdeProps.toMap.asJava) + instance.initialize(inputStream, conf, props) + instance + } + } + + def recordWriter(outputStream: OutputStream, conf: 
Configuration): Option[RecordWriter] = { + recordWriterClass.map { klass => + val instance = Utils.classForName(klass).newInstance().asInstanceOf[RecordWriter] + instance.initialize(outputStream, conf) + instance + } + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index d1699dd536817..9f654eed5761c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -235,7 +235,7 @@ case class CreateMetastoreDataSourceAsSelect( sqlContext, Some(query.schema.asNullable), partitionColumns, provider, optionsWithPath) val createdRelation = LogicalRelation(resolved.relation) EliminateSubQueries(sqlContext.catalog.lookupRelation(tableIdent.toSeq)) match { - case l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation) => + case l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation, _) => if (l.relation != createdRelation.relation) { val errorDescription = s"Cannot append to table $tableName because the resolved relation does not " + diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index cad02373e5ba1..a85d4db88d7eb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -37,6 +37,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule @@ -59,19 +60,36 @@ private[hive] class HiveFunctionRegistry(underlying: analysis.FunctionRegistry) val functionClassName = functionInfo.getFunctionClass.getName - if (classOf[UDF].isAssignableFrom(functionInfo.getFunctionClass)) { - HiveSimpleUDF(new HiveFunctionWrapper(functionClassName), children) - } else if (classOf[GenericUDF].isAssignableFrom(functionInfo.getFunctionClass)) { - HiveGenericUDF(new HiveFunctionWrapper(functionClassName), children) - } else if ( - classOf[AbstractGenericUDAFResolver].isAssignableFrom(functionInfo.getFunctionClass)) { - HiveGenericUDAF(new HiveFunctionWrapper(functionClassName), children) - } else if (classOf[UDAF].isAssignableFrom(functionInfo.getFunctionClass)) { - HiveUDAF(new HiveFunctionWrapper(functionClassName), children) - } else if (classOf[GenericUDTF].isAssignableFrom(functionInfo.getFunctionClass)) { - HiveGenericUDTF(new HiveFunctionWrapper(functionClassName), children) - } else { - sys.error(s"No handler for udf ${functionInfo.getFunctionClass}") + // When we instantiate hive UDF wrapper class, we may throw exception if the input expressions + // don't satisfy the hive UDF, such as type mismatch, input number mismatch, etc. Here we + // catch the exception and throw AnalysisException instead. 
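// Illustration only, not part of this patch: a minimal sketch of the wrap-and-rethrow pattern
// used in the block below, extracted for clarity. It assumes the same scope and imports as
// HiveFunctionRegistry (so the package-private AnalysisException constructor is accessible),
// and `buildWrapper` stands in for the chain of classOf checks.
private def wrapFailuresAsAnalysisException[T](udfClass: Class[_])(buildWrapper: => T): T = {
  try buildWrapper catch {
    case ae: AnalysisException =>
      // Analysis errors raised by the wrappers themselves are passed through unchanged.
      throw ae
    case t: Throwable =>
      // Anything else (type mismatch, wrong arity, reflection failure) is reported as an
      // analysis error instead of crashing the query at runtime.
      throw new AnalysisException(s"No handler for Hive udf $udfClass because: ${t.getMessage}.")
  }
}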
+ try { + if (classOf[UDF].isAssignableFrom(functionInfo.getFunctionClass)) { + HiveSimpleUDF(new HiveFunctionWrapper(functionClassName), children) + } else if (classOf[GenericUDF].isAssignableFrom(functionInfo.getFunctionClass)) { + HiveGenericUDF(new HiveFunctionWrapper(functionClassName), children) + } else if ( + classOf[AbstractGenericUDAFResolver].isAssignableFrom(functionInfo.getFunctionClass)) { + HiveUDAFFunction(new HiveFunctionWrapper(functionClassName), children) + } else if (classOf[UDAF].isAssignableFrom(functionInfo.getFunctionClass)) { + HiveUDAFFunction( + new HiveFunctionWrapper(functionClassName), children, isUDAFBridgeRequired = true) + } else if (classOf[GenericUDTF].isAssignableFrom(functionInfo.getFunctionClass)) { + val udtf = HiveGenericUDTF(new HiveFunctionWrapper(functionClassName), children) + udtf.elementTypes // Force it to check input data types. + udtf + } else { + throw new AnalysisException(s"No handler for udf ${functionInfo.getFunctionClass}") + } + } catch { + case analysisException: AnalysisException => + // If the exception is an AnalysisException, just throw it. + throw analysisException + case throwable: Throwable => + // If there is any other error, we throw an AnalysisException. + val errorMessage = s"No handler for Hive udf ${functionInfo.getFunctionClass} " + + s"because: ${throwable.getMessage}." + throw new AnalysisException(errorMessage) } } } @@ -132,7 +150,7 @@ private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, childre @transient private lazy val conversionHelper = new ConversionHelper(method, arguments) - val dataType = javaClassToDataType(method.getReturnType) + override val dataType = javaClassToDataType(method.getReturnType) @transient lazy val returnInspector = ObjectInspectorFactory.getReflectionObjectInspector( @@ -203,7 +221,7 @@ private[hive] case class HiveGenericUDF(funcWrapper: HiveFunctionWrapper, childr new DeferredObjectAdapter(inspect, child.dataType) }.toArray[DeferredObject] - lazy val dataType: DataType = inspectorToDataType(returnInspector) + override val dataType: DataType = inspectorToDataType(returnInspector) override def eval(input: InternalRow): Any = { returnInspector // Make sure initialized. @@ -229,6 +247,12 @@ private[hive] case class HiveGenericUDF(funcWrapper: HiveFunctionWrapper, childr * Resolves [[UnresolvedWindowFunction]] to [[HiveWindowFunction]]. */ private[spark] object ResolveHiveWindowFunction extends Rule[LogicalPlan] { + private def shouldResolveFunction( + unresolvedWindowFunction: UnresolvedWindowFunction, + windowSpec: WindowSpecDefinition): Boolean = { + unresolvedWindowFunction.childrenResolved && windowSpec.childrenResolved + } + def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case p: LogicalPlan if !p.childrenResolved => p @@ -236,9 +260,11 @@ private[spark] object ResolveHiveWindowFunction extends Rule[LogicalPlan] { // replaced those WindowSpecReferences. case p: LogicalPlan => p transformExpressions { + // We will not start to resolve the function unless all arguments are resolved + // and all expressions in window spec are fixed. case WindowExpression( - UnresolvedWindowFunction(name, children), - windowSpec: WindowSpecDefinition) => + u @ UnresolvedWindowFunction(name, children), + windowSpec: WindowSpecDefinition) if shouldResolveFunction(u, windowSpec) => // First, let's find the window function info. 
val windowFunctionInfo: WindowFunctionInfo = Option(FunctionRegistry.getWindowFunctionInfo(name.toLowerCase)).getOrElse( @@ -254,7 +280,7 @@ private[spark] object ResolveHiveWindowFunction extends Rule[LogicalPlan] { // are expressions in Order By clause. if (classOf[GenericUDAFRank].isAssignableFrom(functionClass)) { if (children.nonEmpty) { - throw new AnalysisException(s"$name does not take input parameters.") + throw new AnalysisException(s"$name does not take input parameters.") } windowSpec.orderSpec.map(_.child) } else { @@ -356,7 +382,7 @@ private[hive] case class HiveWindowFunction( evaluator.init(GenericUDAFEvaluator.Mode.COMPLETE, inputInspectors) } - override def dataType: DataType = + override val dataType: DataType = if (!pivotResult) { inspectorToDataType(returnInspector) } else { @@ -441,70 +467,6 @@ private[hive] case class HiveWindowFunction( new HiveWindowFunction(funcWrapper, pivotResult, isUDAFBridgeRequired, children) } -private[hive] case class HiveGenericUDAF( - funcWrapper: HiveFunctionWrapper, - children: Seq[Expression]) extends AggregateExpression1 - with HiveInspectors { - - type UDFType = AbstractGenericUDAFResolver - - @transient - protected lazy val resolver: AbstractGenericUDAFResolver = funcWrapper.createFunction() - - @transient - protected lazy val objectInspector = { - val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors.toArray, false, false) - resolver.getEvaluator(parameterInfo) - .init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors.toArray) - } - - @transient - protected lazy val inspectors = children.map(toInspector) - - def dataType: DataType = inspectorToDataType(objectInspector) - - def nullable: Boolean = true - - override def toString: String = { - s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})" - } - - def newInstance(): HiveUDAFFunction = new HiveUDAFFunction(funcWrapper, children, this) -} - -/** It is used as a wrapper for the hive functions which uses UDAF interface */ -private[hive] case class HiveUDAF( - funcWrapper: HiveFunctionWrapper, - children: Seq[Expression]) extends AggregateExpression1 - with HiveInspectors { - - type UDFType = UDAF - - @transient - protected lazy val resolver: AbstractGenericUDAFResolver = - new GenericUDAFBridge(funcWrapper.createFunction()) - - @transient - protected lazy val objectInspector = { - val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors.toArray, false, false) - resolver.getEvaluator(parameterInfo) - .init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors.toArray) - } - - @transient - protected lazy val inspectors = children.map(toInspector) - - def dataType: DataType = inspectorToDataType(objectInspector) - - def nullable: Boolean = true - - override def toString: String = { - s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})" - } - - def newInstance(): HiveUDAFFunction = new HiveUDAFFunction(funcWrapper, children, this, true) -} - /** * Converts a Hive Generic User Defined Table Generating Function (UDTF) to a * [[Generator]]. 
Note that the semantics of Generators do not allow @@ -540,7 +502,7 @@ private[hive] case class HiveGenericUDTF( @transient protected lazy val collector = new UDTFCollector - lazy val elementTypes = outputInspector.getAllStructFieldRefs.asScala.map { + override lazy val elementTypes = outputInspector.getAllStructFieldRefs.asScala.map { field => (inspectorToDataType(field.getFieldObjectInspector), true) } @@ -584,49 +546,86 @@ private[hive] case class HiveGenericUDTF( } } +/** + * Currently we don't support partial aggregation for queries using Hive UDAF, which may hurt + * performance a lot. + */ private[hive] case class HiveUDAFFunction( funcWrapper: HiveFunctionWrapper, - exprs: Seq[Expression], - base: AggregateExpression1, + children: Seq[Expression], isUDAFBridgeRequired: Boolean = false) - extends AggregateFunction1 - with HiveInspectors { + extends AggregateFunction2 with HiveInspectors { - def this() = this(null, null, null) + def this() = this(null, null) - private val resolver = + @transient + private lazy val resolver = if (isUDAFBridgeRequired) { new GenericUDAFBridge(funcWrapper.createFunction[UDAF]()) } else { funcWrapper.createFunction[AbstractGenericUDAFResolver]() } - private val inspectors = exprs.map(toInspector).toArray + @transient + private lazy val inspectors = children.map(toInspector).toArray - private val function = { + @transient + private lazy val functionAndInspector = { val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors, false, false) - resolver.getEvaluator(parameterInfo) + val f = resolver.getEvaluator(parameterInfo) + f -> f.init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors) } - private val returnInspector = function.init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors) + @transient + private lazy val function = functionAndInspector._1 + + @transient + private lazy val returnInspector = functionAndInspector._2 - private val buffer = - function.getNewAggregationBuffer + @transient + private[this] var buffer: GenericUDAFEvaluator.AggregationBuffer = _ override def eval(input: InternalRow): Any = unwrap(function.evaluate(buffer), returnInspector) @transient - val inputProjection = new InterpretedProjection(exprs) + private lazy val inputProjection = new InterpretedProjection(children) @transient - protected lazy val cached = new Array[AnyRef](exprs.length) + private lazy val cached = new Array[AnyRef](children.length) @transient - private lazy val inputDataTypes: Array[DataType] = exprs.map(_.dataType).toArray + private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray - def update(input: InternalRow): Unit = { + // Hive UDAF has its own buffer, so we don't need to occupy a slot in the aggregation + // buffer for it. + override def bufferSchema: StructType = StructType(Nil) + + override def update(_buffer: MutableRow, input: InternalRow): Unit = { val inputs = inputProjection(input) function.iterate(buffer, wrap(inputs, inspectors, cached, inputDataTypes)) } + + override def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = { + throw new UnsupportedOperationException( + "Hive UDAF doesn't support partial aggregate") + } + + override def cloneBufferAttributes: Seq[Attribute] = Nil + + override def initialize(_buffer: MutableRow): Unit = { + buffer = function.getNewAggregationBuffer + } + + override def bufferAttributes: Seq[AttributeReference] = Nil + + // We rely on Hive to check the input data types, so use `AnyDataType` here to bypass our + // catalyst type checking framework. 
+ override def inputTypes: Seq[AbstractDataType] = children.map(_ => AnyDataType) + + override def nullable: Boolean = true + + override def supportsPartial: Boolean = false + + override val dataType: DataType = inspectorToDataType(returnInspector) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala index 8dc796b056a72..c8d6b718045a5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala @@ -32,7 +32,6 @@ import org.apache.hadoop.mapred._ import org.apache.hadoop.hive.common.FileUtils import org.apache.spark.mapred.SparkHadoopMapRedUtil -import org.apache.spark.sql.Row import org.apache.spark.{Logging, SerializableWritable, SparkHadoopWriter} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -45,7 +44,7 @@ import org.apache.spark.util.SerializableJobConf * It is based on [[SparkHadoopWriter]]. */ private[hive] class SparkHiveWriterContainer( - @transient jobConf: JobConf, + jobConf: JobConf, fileSinkConf: FileSinkDesc) extends Logging with SparkHadoopMapRedUtil @@ -122,7 +121,7 @@ private[hive] class SparkHiveWriterContainer( } protected def commit() { - SparkHadoopMapRedUtil.commitTask(committer, taskContext, jobID, splitID, attemptID) + SparkHadoopMapRedUtil.commitTask(committer, taskContext, jobID, splitID) } private def setIDs(jobId: Int, splitId: Int, attemptId: Int) { @@ -163,7 +162,7 @@ private[spark] object SparkHiveDynamicPartitionWriterContainer { } private[spark] class SparkHiveDynamicPartitionWriterContainer( - @transient jobConf: JobConf, + jobConf: JobConf, fileSinkConf: FileSinkDesc, dynamicPartColNames: Array[String]) extends SparkHiveWriterContainer(jobConf, fileSinkConf) { @@ -194,10 +193,10 @@ private[spark] class SparkHiveDynamicPartitionWriterContainer( // Better solution is to add a step similar to what Hive FileSinkOperator.jobCloseOp does: // calling something like Utilities.mvFileToFinalPath to cleanup the output directory and then // load it with loadDynamicPartitions/loadPartition/loadTable. - val oldMarker = jobConf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true) - jobConf.setBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, false) + val oldMarker = conf.value.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true) + conf.value.setBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, false) super.commitJob() - jobConf.setBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, oldMarker) + conf.value.setBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, oldMarker) } override def getLocalFileWriter(row: InternalRow, schema: StructType) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala index b3d9f7f71a27d..27193f54d3a91 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala @@ -31,11 +31,13 @@ import org.apache.spark.sql.sources._ * and cannot be used anymore. 
*/ private[orc] object OrcFilters extends Logging { - def createFilter(expr: Array[Filter]): Option[SearchArgument] = { - expr.reduceOption(And).flatMap { conjunction => - val builder = SearchArgumentFactory.newBuilder() - buildSearchArgument(conjunction, builder).map(_.build()) - } + def createFilter(filters: Array[Filter]): Option[SearchArgument] = { + for { + // Combines all filters with `And`s to produce a single conjunction predicate + conjunction <- filters.reduceOption(And) + // Then tries to build a single ORC `SearchArgument` for the conjunction predicate + builder <- buildSearchArgument(conjunction, SearchArgumentFactory.newBuilder()) + } yield builder.build() } private def buildSearchArgument(expression: Filter, builder: Builder): Option[Builder] = { @@ -102,46 +104,32 @@ private[orc] object OrcFilters extends Logging { negate <- buildSearchArgument(child, builder.startNot()) } yield negate.end() - case EqualTo(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.equals(attribute, _)) + case EqualTo(attribute, value) if isSearchableLiteral(value) => + Some(builder.startAnd().equals(attribute, value).end()) - case EqualNullSafe(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.nullSafeEquals(attribute, _)) + case EqualNullSafe(attribute, value) if isSearchableLiteral(value) => + Some(builder.startAnd().nullSafeEquals(attribute, value).end()) - case LessThan(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.lessThan(attribute, _)) + case LessThan(attribute, value) if isSearchableLiteral(value) => + Some(builder.startAnd().lessThan(attribute, value).end()) - case LessThanOrEqual(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.lessThanEquals(attribute, _)) + case LessThanOrEqual(attribute, value) if isSearchableLiteral(value) => + Some(builder.startAnd().lessThanEquals(attribute, value).end()) - case GreaterThan(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.startNot().lessThanEquals(attribute, _).end()) + case GreaterThan(attribute, value) if isSearchableLiteral(value) => + Some(builder.startNot().lessThanEquals(attribute, value).end()) - case GreaterThanOrEqual(attribute, value) => - Option(value) - .filter(isSearchableLiteral) - .map(builder.startNot().lessThan(attribute, _).end()) + case GreaterThanOrEqual(attribute, value) if isSearchableLiteral(value) => + Some(builder.startNot().lessThan(attribute, value).end()) case IsNull(attribute) => - Some(builder.isNull(attribute)) + Some(builder.startAnd().isNull(attribute).end()) case IsNotNull(attribute) => Some(builder.startNot().isNull(attribute).end()) - case In(attribute, values) => - Option(values) - .filter(_.forall(isSearchableLiteral)) - .map(builder.in(attribute, _)) + case In(attribute, values) if values.forall(isSearchableLiteral) => + Some(builder.startAnd().in(attribute, values.map(_.asInstanceOf[AnyRef]): _*).end()) case _ => None } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala index 1cff5cf9c3543..d1f30e188eafb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala @@ -25,15 +25,16 @@ import com.google.common.base.Objects import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import 
org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde, OrcSplit} -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils +import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde, OrcSplit, OrcStruct} +import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector +import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfoUtils, StructTypeInfo} import org.apache.hadoop.io.{NullWritable, Writable} import org.apache.hadoop.mapred.{InputFormat => MapRedInputFormat, JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter} import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.Logging +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.rdd.{HadoopRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow @@ -77,7 +78,8 @@ private[orc] class OrcOutputWriter( }.mkString(":")) val serde = new OrcSerde - serde.initialize(context.getConfiguration, table) + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + serde.initialize(configuration, table) serde } @@ -87,21 +89,10 @@ private[orc] class OrcOutputWriter( TypeInfoUtils.getTypeInfoFromTypeString( HiveMetastoreTypes.toMetastoreType(dataSchema)) - TypeInfoUtils - .getStandardJavaObjectInspectorFromTypeInfo(typeInfo) - .asInstanceOf[StructObjectInspector] + OrcStruct.createObjectInspector(typeInfo.asInstanceOf[StructTypeInfo]) + .asInstanceOf[SettableStructObjectInspector] } - // Used to hold temporary `Writable` fields of the next row to be written. - private val reusableOutputBuffer = new Array[Any](dataSchema.length) - - // Used to convert Catalyst values into Hadoop `Writable`s. - private val wrappers = structOI.getAllStructFieldRefs.asScala - .zip(dataSchema.fields.map(_.dataType)) - .map { case (ref, dt) => - wrapperFor(ref.getFieldObjectInspector, dt) - }.toArray - // `OrcRecordWriter.close()` creates an empty file if no rows are written at all. We use this // flag to decide whether `OrcRecordWriter.close()` needs to be called. 
private var recordWriterInstantiated = false @@ -109,9 +100,10 @@ private[orc] class OrcOutputWriter( private lazy val recordWriter: RecordWriter[NullWritable, Writable] = { recordWriterInstantiated = true - val conf = context.getConfiguration + val conf = SparkHadoopUtil.get.getConfigurationFromJobContext(context) val uniqueWriteJobId = conf.get("spark.sql.sources.writeJobUUID") - val partition = context.getTaskAttemptID.getTaskID.getId + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val partition = taskAttemptId.getTaskID.getId val filename = f"part-r-$partition%05d-$uniqueWriteJobId.orc" new OrcOutputFormat().getRecordWriter( @@ -124,16 +116,32 @@ private[orc] class OrcOutputWriter( override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal") - override protected[sql] def writeInternal(row: InternalRow): Unit = { + private def wrapOrcStruct( + struct: OrcStruct, + oi: SettableStructObjectInspector, + row: InternalRow): Unit = { + val fieldRefs = oi.getAllStructFieldRefs var i = 0 - while (i < row.numFields) { - reusableOutputBuffer(i) = wrappers(i)(row.get(i, dataSchema(i).dataType)) + while (i < fieldRefs.size) { + oi.setStructFieldData( + struct, + fieldRefs.get(i), + wrap( + row.get(i, dataSchema(i).dataType), + fieldRefs.get(i).getFieldObjectInspector, + dataSchema(i).dataType)) i += 1 } + } + + val cachedOrcStruct = structOI.create().asInstanceOf[OrcStruct] + + override protected[sql] def writeInternal(row: InternalRow): Unit = { + wrapOrcStruct(cachedOrcStruct, structOI, row) recordWriter.write( NullWritable.get(), - serializer.serialize(reusableOutputBuffer, structOI)) + serializer.serialize(cachedOrcStruct, structOI)) } override def close(): Unit = { @@ -200,7 +208,7 @@ private[sql] class OrcRelation( } override def prepareJobForWrite(job: Job): OutputWriterFactory = { - job.getConfiguration match { + SparkHadoopUtil.get.getConfigurationFromJobContext(job) match { case conf: JobConf => conf.setOutputFormat(classOf[OrcOutputFormat]) case conf => @@ -256,7 +264,7 @@ private[orc] case class OrcTableScan( maybeStructOI.map { soi => val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map { case (attr, ordinal) => - soi.getStructFieldRef(attr.name.toLowerCase) -> ordinal + soi.getStructFieldRef(attr.name) -> ordinal }.unzip val unwrappers = fieldRefs.map(unwrapperFor) // Map each tuple to a row object @@ -281,7 +289,7 @@ private[orc] case class OrcTableScan( def execute(): RDD[InternalRow] = { val job = new Job(sqlContext.sparkContext.hadoopConfiguration) - val conf = job.getConfiguration + val conf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) // Tries to push down filters if ORC filter push-down is enabled if (sqlContext.conf.orcFilterPushDown) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 57fea5d8db343..be335a47dcabd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -29,7 +29,7 @@ import org.apache.hadoop.hive.ql.exec.FunctionRegistry import org.apache.hadoop.hive.ql.processors._ import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe -import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.{SQLContext, SQLConf} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import 
org.apache.spark.sql.execution.CacheTableCommand @@ -51,6 +51,11 @@ object TestHive // SPARK-8910 .set("spark.ui.enabled", "false"))) +trait TestHiveSingleton { + protected val sqlContext: SQLContext = TestHive + protected val hiveContext: TestHiveContext = TestHive +} + /** * A locally running test instance of Spark's Hive execution engine. * @@ -111,18 +116,28 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { override def executePlan(plan: LogicalPlan): this.QueryExecution = new this.QueryExecution(plan) + // Make sure we set those test specific confs correctly when we create + // the SQLConf as well as when we call clear. override protected[sql] def createSession(): SQLSession = { new this.SQLSession() } protected[hive] class SQLSession extends super.SQLSession { - /** Fewer partitions to speed up testing. */ protected[sql] override lazy val conf: SQLConf = new SQLConf { - override def numShufflePartitions: Int = getConf(SQLConf.SHUFFLE_PARTITIONS, 5) // TODO as in unit test, conf.clear() probably be called, all of the value will be cleared. // The super.getConf(SQLConf.DIALECT) is "sql" by default, we need to set it as "hiveql" override def dialect: String = super.getConf(SQLConf.DIALECT, "hiveql") override def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, false) + + clear() + + override def clear(): Unit = { + super.clear() + + TestHiveContext.overrideConfs.map { + case (key, value) => setConfString(key, value) + } + } } } @@ -450,3 +465,15 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { } } } + +private[hive] object TestHiveContext { + + /** + * A map used to store all confs that need to be overridden in sql/hive unit tests. + */ + val overrideConfs: Map[String, String] = + Map( + // Fewer shuffle partitions to speed up testing. 
+ SQLConf.SHUFFLE_PARTITIONS.key -> "5" + ) +} diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java index 019d8a30266e2..b4bf9eef8fca5 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java @@ -40,7 +40,7 @@ public class JavaDataFrameSuite { DataFrame df; - private void checkAnswer(DataFrame actual, List expected) { + private static void checkAnswer(DataFrame actual, List expected) { String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); if (errorMessage != null) { Assert.fail(errorMessage); @@ -52,7 +52,7 @@ public void setUp() throws IOException { hc = TestHive$.MODULE$; sc = new JavaSparkContext(hc.sparkContext()); - List jsonObjects = new ArrayList(10); + List jsonObjects = new ArrayList<>(10); for (int i = 0; i < 10; i++) { jsonObjects.add("{\"key\":" + i + ", \"value\":\"str" + i + "\"}"); } @@ -71,7 +71,7 @@ public void tearDown() throws IOException { @Test public void saveTableAndQueryIt() { checkAnswer( - df.select(functions.avg("key").over( + df.select(avg("key").over( Window.partitionBy("value").orderBy("key").rowsBetween(-1, 1))), hc.sql("SELECT avg(key) " + "OVER (PARTITION BY value " + @@ -95,7 +95,7 @@ public void testUDAF() { registeredUDAF.apply(col("value")), callUDF("mydoublesum", col("value"))); - List expectedResult = new ArrayList(); + List expectedResult = new ArrayList<>(); expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0)); checkAnswer( aggregatedDF, diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java index 4192155975c47..c8d272794d10b 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -53,7 +53,7 @@ public class JavaMetastoreDataSourcesSuite { FileSystem fs; DataFrame df; - private void checkAnswer(DataFrame actual, List expected) { + private static void checkAnswer(DataFrame actual, List expected) { String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); if (errorMessage != null) { Assert.fail(errorMessage); @@ -77,7 +77,7 @@ public void setUp() throws IOException { fs.delete(hiveManagedPath, true); } - List jsonObjects = new ArrayList(10); + List jsonObjects = new ArrayList<>(10); for (int i = 0; i < 10; i++) { jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}"); } @@ -97,7 +97,7 @@ public void tearDown() throws IOException { @Test public void saveExternalTableAndQueryIt() { - Map options = new HashMap(); + Map options = new HashMap<>(); options.put("path", path.toString()); df.write() .format("org.apache.spark.sql.json") @@ -120,7 +120,7 @@ public void saveExternalTableAndQueryIt() { @Test public void saveExternalTableWithSchemaAndQueryIt() { - Map options = new HashMap(); + Map options = new HashMap<>(); options.put("path", path.toString()); df.write() .format("org.apache.spark.sql.json") @@ -132,7 +132,7 @@ public void saveExternalTableWithSchemaAndQueryIt() { sqlContext.sql("SELECT * FROM javaSavedTable"), df.collectAsList()); - List fields = new ArrayList(); + List fields = new ArrayList<>(); fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); StructType schema = 
DataTypes.createStructType(fields); DataFrame loadedDF = @@ -148,7 +148,7 @@ public void saveExternalTableWithSchemaAndQueryIt() { @Test public void saveTableAndQueryIt() { - Map options = new HashMap(); + Map options = new HashMap<>(); df.write() .format("org.apache.spark.sql.json") .mode(SaveMode.Append) diff --git a/sql/hive/src/test/resources/data/scripts/test_transform.py b/sql/hive/src/test/resources/data/scripts/test_transform.py new file mode 100755 index 0000000000000..ac6d11d8b919c --- /dev/null +++ b/sql/hive/src/test/resources/data/scripts/test_transform.py @@ -0,0 +1,6 @@ +import sys + +delim = sys.argv[1] + +for row in sys.stdin: + print(delim.join([w + '#' for w in row[:-1].split(delim)])) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index 39d315aaeab57..9adb3780a2c55 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -19,14 +19,14 @@ package org.apache.spark.sql.hive import java.io.File -import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, InMemoryRelation} -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.{SaveMode, AnalysisException, DataFrame, QueryTest} +import org.apache.spark.sql.columnar.InMemoryColumnarTableScan +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.{AnalysisException, QueryTest, SaveMode} import org.apache.spark.storage.RDDBlockId import org.apache.spark.util.Utils -class CachedTableSuite extends QueryTest { +class CachedTableSuite extends QueryTest with TestHiveSingleton { + import hiveContext._ def rddIdOf(tableName: String): Int = { val executedPlan = table(tableName).queryExecution.executedPlan @@ -95,18 +95,18 @@ class CachedTableSuite extends QueryTest { test("correct error on uncache of non-cached table") { intercept[IllegalArgumentException] { - TestHive.uncacheTable("src") + hiveContext.uncacheTable("src") } } test("'CACHE TABLE' and 'UNCACHE TABLE' HiveQL statement") { - TestHive.sql("CACHE TABLE src") + sql("CACHE TABLE src") assertCached(table("src")) - assert(TestHive.isCached("src"), "Table 'src' should be cached") + assert(hiveContext.isCached("src"), "Table 'src' should be cached") - TestHive.sql("UNCACHE TABLE src") + sql("UNCACHE TABLE src") assertCached(table("src"), 0) - assert(!TestHive.isCached("src"), "Table 'src' should not be cached") + assert(!hiveContext.isCached("src"), "Table 'src' should not be cached") } test("CACHE TABLE tableName AS SELECT * FROM anotherTable") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala index 30f5313d2b812..cf737836939f9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala @@ -22,12 +22,12 @@ import scala.util.Try import org.scalatest.BeforeAndAfter import org.apache.spark.sql.catalyst.util.quietly -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.{AnalysisException, QueryTest} -class ErrorPositionSuite extends QueryTest with BeforeAndAfter { +class ErrorPositionSuite extends 
QueryTest with TestHiveSingleton with BeforeAndAfter { + import hiveContext.implicits._ before { Seq((1, 1, 1)).toDF("a", "a", "b").registerTempTable("dupAttributes") @@ -122,7 +122,7 @@ class ErrorPositionSuite extends QueryTest with BeforeAndAfter { test(name) { val error = intercept[AnalysisException] { - quietly(sql(query)) + quietly(hiveContext.sql(query)) } assert(!error.getMessage.contains("Seq(")) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala index fb10f8583da99..2e5cae415e54b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameAnalyticsSuite.scala @@ -19,24 +19,25 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.{DataFrame, QueryTest} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.scalatest.BeforeAndAfterAll // TODO ideally we should put the test suite into the package `sql`, as // `hive` package is optional in compiling, however, `SQLContext.sql` doesn't // support the `cube` or `rollup` yet. -class HiveDataFrameAnalyticsSuite extends QueryTest with BeforeAndAfterAll { +class HiveDataFrameAnalyticsSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll { + import hiveContext.implicits._ + import hiveContext.sql + private var testData: DataFrame = _ override def beforeAll() { testData = Seq((1, 2), (2, 4)).toDF("a", "b") - TestHive.registerDataFrameAsTable(testData, "mytable") + hiveContext.registerDataFrameAsTable(testData, "mytable") } override def afterAll(): Unit = { - TestHive.dropTempTable("mytable") + hiveContext.dropTempTable("mytable") } test("rollup") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameJoinSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameJoinSuite.scala index 52e782768cb75..f621367eb553b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameJoinSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameJoinSuite.scala @@ -18,10 +18,10 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.{Row, QueryTest} -import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.hive.test.TestHiveSingleton - -class HiveDataFrameJoinSuite extends QueryTest { +class HiveDataFrameJoinSuite extends QueryTest with TestHiveSingleton { + import hiveContext.implicits._ // We should move this into SQL package if we make case sensitivity configurable in SQL. 
test("join - self join auto resolve ambiguity with case insensitivity") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala index c177cbdd991cf..2c98f1c3cc49c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala @@ -20,10 +20,11 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.{Row, QueryTest} import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.hive.test.TestHiveSingleton -class HiveDataFrameWindowSuite extends QueryTest { +class HiveDataFrameWindowSuite extends QueryTest with TestHiveSingleton { + import hiveContext.implicits._ + import hiveContext.sql test("reuse window partitionBy") { val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 574624d501f22..107457f79ec03 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -19,18 +19,15 @@ package org.apache.spark.sql.hive import java.io.File +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.{QueryTest, Row, SaveMode} import org.apache.spark.sql.hive.client.{ExternalTable, ManagedTable} -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ -import org.apache.spark.sql.sources.DataSourceTest +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.{ExamplePointUDT, SQLTestUtils} import org.apache.spark.sql.types.{DecimalType, StringType, StructType} -import org.apache.spark.sql.{Row, SaveMode, SQLContext} -import org.apache.spark.{Logging, SparkFunSuite} - -class HiveMetastoreCatalogSuite extends SparkFunSuite with Logging { +class HiveMetastoreCatalogSuite extends SparkFunSuite with TestHiveSingleton { + import hiveContext.implicits._ test("struct field should accept underscore in sub-column name") { val hiveTypeStr = "struct" @@ -46,14 +43,15 @@ class HiveMetastoreCatalogSuite extends SparkFunSuite with Logging { } test("duplicated metastore relations") { - val df = sql("SELECT * FROM src") + val df = hiveContext.sql("SELECT * FROM src") logInfo(df.queryExecution.toString) df.as('a).join(df.as('b), $"a.key" === $"b.key") } } -class DataSourceWithHiveMetastoreCatalogSuite extends DataSourceTest with SQLTestUtils { - override def _sqlContext: SQLContext = TestHive +class DataSourceWithHiveMetastoreCatalogSuite + extends QueryTest with SQLTestUtils with TestHiveSingleton { + import hiveContext._ import testImplicits._ private val testDF = range(1, 3).select( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala index fe0db5228de16..5596ec6882ea2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala @@ -17,15 
+17,13 @@ package org.apache.spark.sql.hive -import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.execution.datasources.parquet.ParquetTest -import org.apache.spark.sql.{QueryTest, Row, SQLContext} +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.{QueryTest, Row} case class Cases(lower: String, UPPER: String) -class HiveParquetSuite extends QueryTest with ParquetTest { - private val ctx = TestHive - override def _sqlContext: SQLContext = ctx +class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton { test("Case insensitive attribute names") { withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") { @@ -53,7 +51,7 @@ class HiveParquetSuite extends QueryTest with ParquetTest { test("Converting Hive to Parquet Table via saveAsParquetFile") { withTempPath { dir => sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath) - ctx.read.parquet(dir.getCanonicalPath).registerTempTable("p") + hiveContext.read.parquet(dir.getCanonicalPath).registerTempTable("p") withTempTable("p") { checkAnswer( sql("SELECT * FROM src ORDER BY key"), @@ -66,7 +64,7 @@ class HiveParquetSuite extends QueryTest with ParquetTest { withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") { withTempPath { file => sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath) - ctx.read.parquet(file.getCanonicalPath).registerTempTable("p") + hiveContext.read.parquet(file.getCanonicalPath).registerTempTable("p") withTempTable("p") { // let's do three overwrites for good measure sql("INSERT OVERWRITE TABLE p SELECT * FROM t") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index dc2d85f48624c..5f1660b62d418 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -29,7 +29,7 @@ import org.scalatest.exceptions.TestFailedDueToTimeoutException import org.scalatest.time.SpanSugar._ import org.apache.spark._ -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{SQLContext, QueryTest} import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer import org.apache.spark.sql.types.DecimalType @@ -139,7 +139,7 @@ class HiveSparkSubmitSuite new ProcessOutputCapturer(process.getErrorStream, captureOutput("stderr")).start() try { - val exitCode = failAfter(180.seconds) { process.waitFor() } + val exitCode = failAfter(300.seconds) { process.waitFor() } if (exitCode != 0) { // include logs in output. Note that logging is async and may not have completed // at the time this exception is raised @@ -173,6 +173,7 @@ object SparkSubmitClassLoaderTest extends Logging { def main(args: Array[String]) { Utils.configTestLog4j("INFO") val conf = new SparkConf() + conf.set("spark.ui.enabled", "false") val sc = new SparkContext(conf) val hiveContext = new TestHiveContext(sc) val df = hiveContext.createDataFrame((1 to 100).map(i => (i, i))).toDF("i", "j") @@ -264,6 +265,7 @@ object SparkSQLConfTest extends Logging { // For this simple test, we do not really clone this object. override def clone: SparkConf = this } + conf.set("spark.ui.enabled", "false") val sc = new SparkContext(conf) val hiveContext = new TestHiveContext(sc) // Run a simple command to make sure all lazy vals in hiveContext get instantiated. 
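Note: in HiveSparkSubmitSuite above, the patch relaxes the child-process timeout from 180 to 300 seconds and sets spark.ui.enabled=false in the forked test applications (the same setting is added again further down for ConcurrentHiveSuite), presumably so the many short-lived SparkContexts spawned by these tests do not each start a web UI and contend for ports. A minimal, self-contained sketch of that configuration pattern follows; the object name and workload here are hypothetical, not taken from the patch.

import org.apache.spark.{SparkConf, SparkContext}

object NoUiTestApp {
  def main(args: Array[String]): Unit = {
    // In the patch the master and app name come from spark-submit; local[2] is used
    // here only to keep the sketch runnable on its own.
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("NoUiTestApp")
      .set("spark.ui.enabled", "false") // the setting this patch adds in several places
    val sc = new SparkContext(conf)
    try {
      // ... exercise whatever the test needs, e.g. a TestHiveContext built on sc ...
      println(sc.parallelize(1 to 10).sum())
    } finally {
      sc.stop()
    }
  }
}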
@@ -272,20 +274,24 @@ object SparkSQLConfTest extends Logging { } } -object SPARK_9757 extends QueryTest with Logging { +object SPARK_9757 extends QueryTest { + import org.apache.spark.sql.functions._ + + protected var sqlContext: SQLContext = _ + def main(args: Array[String]): Unit = { Utils.configTestLog4j("INFO") val sparkContext = new SparkContext( new SparkConf() .set("spark.sql.hive.metastore.version", "0.13.1") - .set("spark.sql.hive.metastore.jars", "maven")) + .set("spark.sql.hive.metastore.jars", "maven") + .set("spark.ui.enabled", "false")) val hiveContext = new TestHiveContext(sparkContext) + sqlContext = hiveContext import hiveContext.implicits._ - import org.apache.spark.sql.functions._ - val dir = Utils.createTempDir() dir.delete() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index d33e81227db88..81ee9ba71beb6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -24,28 +24,25 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.{QueryTest, _} -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -/* Implicits */ -import org.apache.spark.sql.hive.test.TestHive._ - case class TestData(key: Int, value: String) case class ThreeCloumntable(key: Int, value: String, key1: String) -class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { - import org.apache.spark.sql.hive.test.TestHive.implicits._ - +class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter { + import hiveContext.implicits._ + import hiveContext.sql - val testData = TestHive.sparkContext.parallelize( + val testData = hiveContext.sparkContext.parallelize( (1 to 100).map(i => TestData(i, i.toString))).toDF() before { // Since every we are doing tests for DDL statements, // it is better to reset before every test. - TestHive.reset() + hiveContext.reset() // Register the testData, which will be used in every test. 
testData.registerTempTable("testData") } @@ -84,9 +81,9 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { test("Double create fails when allowExisting = false") { sql("CREATE TABLE doubleCreateAndInsertTest (key int, value string)") - val message = intercept[QueryExecutionException] { + intercept[QueryExecutionException] { sql("CREATE TABLE doubleCreateAndInsertTest (key int, value string)") - }.getMessage + } } test("Double create does not fail when allowExisting = true") { @@ -96,9 +93,9 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { test("SPARK-4052: scala.collection.Map as value type of MapType") { val schema = StructType(StructField("m", MapType(StringType, StringType), true) :: Nil) - val rowRDD = TestHive.sparkContext.parallelize( + val rowRDD = hiveContext.sparkContext.parallelize( (1 to 100).map(i => Row(scala.collection.mutable.HashMap(s"key$i" -> s"value$i")))) - val df = TestHive.createDataFrame(rowRDD, schema) + val df = hiveContext.createDataFrame(rowRDD, schema) df.registerTempTable("tableWithMapValue") sql("CREATE TABLE hiveTableWithMapValue(m MAP )") sql("INSERT OVERWRITE TABLE hiveTableWithMapValue SELECT m FROM tableWithMapValue") @@ -169,8 +166,8 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { test("Insert ArrayType.containsNull == false") { val schema = StructType(Seq( StructField("a", ArrayType(StringType, containsNull = false)))) - val rowRDD = TestHive.sparkContext.parallelize((1 to 100).map(i => Row(Seq(s"value$i")))) - val df = TestHive.createDataFrame(rowRDD, schema) + val rowRDD = hiveContext.sparkContext.parallelize((1 to 100).map(i => Row(Seq(s"value$i")))) + val df = hiveContext.createDataFrame(rowRDD, schema) df.registerTempTable("tableWithArrayValue") sql("CREATE TABLE hiveTableWithArrayValue(a Array )") sql("INSERT OVERWRITE TABLE hiveTableWithArrayValue SELECT a FROM tableWithArrayValue") @@ -185,9 +182,9 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { test("Insert MapType.valueContainsNull == false") { val schema = StructType(Seq( StructField("m", MapType(StringType, StringType, valueContainsNull = false)))) - val rowRDD = TestHive.sparkContext.parallelize( + val rowRDD = hiveContext.sparkContext.parallelize( (1 to 100).map(i => Row(Map(s"key$i" -> s"value$i")))) - val df = TestHive.createDataFrame(rowRDD, schema) + val df = hiveContext.createDataFrame(rowRDD, schema) df.registerTempTable("tableWithMapValue") sql("CREATE TABLE hiveTableWithMapValue(m Map )") sql("INSERT OVERWRITE TABLE hiveTableWithMapValue SELECT m FROM tableWithMapValue") @@ -202,9 +199,9 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { test("Insert StructType.fields.exists(_.nullable == false)") { val schema = StructType(Seq( StructField("s", StructType(Seq(StructField("f", StringType, nullable = false)))))) - val rowRDD = TestHive.sparkContext.parallelize( + val rowRDD = hiveContext.sparkContext.parallelize( (1 to 100).map(i => Row(Row(s"value$i")))) - val df = TestHive.createDataFrame(rowRDD, schema) + val df = hiveContext.createDataFrame(rowRDD, schema) df.registerTempTable("tableWithStructValue") sql("CREATE TABLE hiveTableWithStructValue(s Struct )") sql("INSERT OVERWRITE TABLE hiveTableWithStructValue SELECT s FROM tableWithStructValue") @@ -217,11 +214,11 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { } test("SPARK-5498:partition schema does not match table schema") { - val testData = TestHive.sparkContext.parallelize( + val 
testData = hiveContext.sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.registerTempTable("testData") - val testDatawithNull = TestHive.sparkContext.parallelize( + val testDatawithNull = hiveContext.sparkContext.parallelize( (1 to 10).map(i => ThreeCloumntable(i, i.toString, null))).toDF() val tmpDir = Utils.createTempDir() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala index d3388a9429e41..579631df772b5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala @@ -19,17 +19,15 @@ package org.apache.spark.sql.hive import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.QueryTest import org.apache.spark.sql.Row -class ListTablesSuite extends QueryTest with BeforeAndAfterAll { +class ListTablesSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll { + import hiveContext._ + import hiveContext.implicits._ - import org.apache.spark.sql.hive.test.TestHive.implicits._ - - val df = - sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value") + val df = sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value") override def beforeAll(): Unit = { // The catalog in HiveContext is a case insensitive one. diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 20a50586d5201..d3565380005a0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -22,15 +22,11 @@ import java.io.{IOException, File} import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path -import org.scalatest.BeforeAndAfterAll -import org.apache.spark.Logging import org.apache.spark.sql._ import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.hive.client.{HiveTable, ManagedTable} -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ @@ -39,10 +35,9 @@ import org.apache.spark.util.Utils /** * Tests for persisting tables created though the data sources API into the metastore. 
*/ -class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll - with Logging { - override def _sqlContext: SQLContext = TestHive - private val sqlContext = _sqlContext +class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + import hiveContext._ + import hiveContext.implicits._ var jsonFilePath: String = _ @@ -575,7 +570,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA Row(3) :: Row(4) :: Nil) table("test_parquet_ctas").queryExecution.optimizedPlan match { - case LogicalRelation(p: ParquetRelation) => // OK + case LogicalRelation(p: ParquetRelation, _) => // OK case _ => fail(s"test_parquet_ctas should have be converted to ${classOf[ParquetRelation]}") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala index 997c667ec0d1b..f16c257ab5ab4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala @@ -17,20 +17,16 @@ package org.apache.spark.sql.hive -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.{AnalysisException, QueryTest, SQLContext, SaveMode} +import org.apache.spark.sql.{AnalysisException, QueryTest, SaveMode} -class MultiDatabaseSuite extends QueryTest with SQLTestUtils { - override val _sqlContext: HiveContext = TestHive - private val sqlContext = _sqlContext - - private val df = sqlContext.range(10).coalesce(1) +class MultiDatabaseSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + private lazy val df = sqlContext.range(10).coalesce(1) private def checkTablePath(dbName: String, tableName: String): Unit = { - // val hiveContext = sqlContext.asInstanceOf[HiveContext] - val metastoreTable = sqlContext.catalog.client.getTable(dbName, tableName) - val expectedPath = sqlContext.catalog.client.getDatabase(dbName).location + "/" + tableName + val metastoreTable = hiveContext.catalog.client.getTable(dbName, tableName) + val expectedPath = hiveContext.catalog.client.getDatabase(dbName).location + "/" + tableName assert(metastoreTable.serdeProperties("path") === expectedPath) } @@ -220,7 +216,7 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { df.write.parquet(s"$path/p=2") sql("ALTER TABLE t ADD PARTITION (p=2)") - sqlContext.refreshTable("t") + hiveContext.refreshTable("t") checkAnswer( sqlContext.table("t"), df.withColumn("p", lit(1)).unionAll(df.withColumn("p", lit(2)))) @@ -252,7 +248,7 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils { df.write.parquet(s"$path/p=2") sql(s"ALTER TABLE $db.t ADD PARTITION (p=2)") - sqlContext.refreshTable(s"$db.t") + hiveContext.refreshTable(s"$db.t") checkAnswer( sqlContext.table(s"$db.t"), df.withColumn("p", lit(1)).unionAll(df.withColumn("p", lit(2)))) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala index 91d7a48208e8d..49aab85cf1aaf 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala @@ -18,38 +18,20 @@ package org.apache.spark.sql.hive import java.sql.Timestamp -import 
java.util.{Locale, TimeZone} import org.apache.hadoop.hive.conf.HiveConf -import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.execution.datasources.parquet.ParquetCompatibilityTest -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.{Row, SQLConf, SQLContext} - -class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with BeforeAndAfterAll { - override def _sqlContext: SQLContext = TestHive - private val sqlContext = _sqlContext +import org.apache.spark.sql.{Row, SQLConf} +import org.apache.spark.sql.hive.test.TestHiveSingleton +class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHiveSingleton { /** * Set the staging directory (and hence path to ignore Parquet files under) * to that set by [[HiveConf.ConfVars.STAGINGDIR]]. */ private val stagingDir = new HiveConf().getVar(HiveConf.ConfVars.STAGINGDIR) - private val originalTimeZone = TimeZone.getDefault - private val originalLocale = Locale.getDefault - - protected override def beforeAll(): Unit = { - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - Locale.setDefault(Locale.US) - } - - override protected def afterAll(): Unit = { - TimeZone.setDefault(originalTimeZone) - Locale.setDefault(originalLocale) - } - override protected def logParquetSchema(path: String): Unit = { val schema = readParquetSchema(path, { path => !path.getName.startsWith("_") && !path.getName.startsWith(stagingDir) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala index 1cc8a93e83411..f542a5a02508c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala @@ -18,22 +18,18 @@ package org.apache.spark.sql.hive import com.google.common.io.Files -import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.util.Utils +import org.apache.spark.sql.{QueryTest, _} +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils +class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + import hiveContext.implicits._ -class QueryPartitionSuite extends QueryTest with SQLTestUtils { - - private lazy val ctx = org.apache.spark.sql.hive.test.TestHive - import ctx.implicits._ - - protected def _sqlContext = ctx - - test("SPARK-5068: query data when path doesn't exist"){ + test("SPARK-5068: query data when path doesn't exist") { withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { - val testData = ctx.sparkContext.parallelize( + val testData = sparkContext.parallelize( (1 to 10).map(i => TestData(i, i.toString))).toDF() testData.registerTempTable("testData") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index e4fec7e2c8a2a..6a692d6fce562 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -17,24 +17,15 @@ package org.apache.spark.sql.hive -import org.scalatest.BeforeAndAfterAll - import scala.reflect.ClassTag import org.apache.spark.sql.{Row, SQLConf, QueryTest} import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.hive.execution._ +import org.apache.spark.sql.hive.test.TestHiveSingleton 
-class StatisticsSuite extends QueryTest with BeforeAndAfterAll { - - private lazy val ctx: HiveContext = { - val ctx = org.apache.spark.sql.hive.test.TestHive - ctx.reset() - ctx.cacheTables = false - ctx - } - - import ctx.sql +class StatisticsSuite extends QueryTest with TestHiveSingleton { + import hiveContext.sql test("parse analyze commands") { def assertAnalyzeCommand(analyzeCommand: String, c: Class[_]) { @@ -54,9 +45,6 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { } } - // Ensure session state is initialized. - ctx.parseSql("use default") - assertAnalyzeCommand( "ANALYZE TABLE Table1 COMPUTE STATISTICS", classOf[HiveNativeCommand]) @@ -80,7 +68,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { test("analyze MetastoreRelations") { def queryTotalSize(tableName: String): BigInt = - ctx.catalog.lookupRelation(Seq(tableName)).statistics.sizeInBytes + hiveContext.catalog.lookupRelation(Seq(tableName)).statistics.sizeInBytes // Non-partitioned table sql("CREATE TABLE analyzeTable (key STRING, value STRING)").collect() @@ -114,7 +102,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { |SELECT * FROM src """.stripMargin).collect() - assert(queryTotalSize("analyzeTable_part") === ctx.conf.defaultSizeInBytes) + assert(queryTotalSize("analyzeTable_part") === hiveContext.conf.defaultSizeInBytes) sql("ANALYZE TABLE analyzeTable_part COMPUTE STATISTICS noscan") @@ -125,9 +113,9 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { // Try to analyze a temp table sql("""SELECT * FROM src""").registerTempTable("tempTable") intercept[UnsupportedOperationException] { - ctx.analyze("tempTable") + hiveContext.analyze("tempTable") } - ctx.catalog.unregisterTable(Seq("tempTable")) + hiveContext.catalog.unregisterTable(Seq("tempTable")) } test("estimates the size of a test MetastoreRelation") { @@ -155,8 +143,8 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { val sizes = df.queryExecution.analyzed.collect { case r if ct.runtimeClass.isAssignableFrom(r.getClass) => r.statistics.sizeInBytes } - assert(sizes.size === 2 && sizes(0) <= ctx.conf.autoBroadcastJoinThreshold - && sizes(1) <= ctx.conf.autoBroadcastJoinThreshold, + assert(sizes.size === 2 && sizes(0) <= hiveContext.conf.autoBroadcastJoinThreshold + && sizes(1) <= hiveContext.conf.autoBroadcastJoinThreshold, s"query should contain two relations, each of which has size smaller than autoConvertSize") // Using `sparkPlan` because for relevant patterns in HashJoin to be @@ -167,8 +155,8 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { checkAnswer(df, expectedAnswer) // check correctness of output - ctx.conf.settings.synchronized { - val tmp = ctx.conf.autoBroadcastJoinThreshold + hiveContext.conf.settings.synchronized { + val tmp = hiveContext.conf.autoBroadcastJoinThreshold sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=-1""") df = sql(query) @@ -211,8 +199,8 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { .isAssignableFrom(r.getClass) => r.statistics.sizeInBytes } - assert(sizes.size === 2 && sizes(1) <= ctx.conf.autoBroadcastJoinThreshold - && sizes(0) <= ctx.conf.autoBroadcastJoinThreshold, + assert(sizes.size === 2 && sizes(1) <= hiveContext.conf.autoBroadcastJoinThreshold + && sizes(0) <= hiveContext.conf.autoBroadcastJoinThreshold, s"query should contain two relations, each of which has size smaller than autoConvertSize") // Using `sparkPlan` because for relevant patterns in HashJoin to be @@ -225,8 +213,8 @@ 
class StatisticsSuite extends QueryTest with BeforeAndAfterAll { checkAnswer(df, answer) // check correctness of output - ctx.conf.settings.synchronized { - val tmp = ctx.conf.autoBroadcastJoinThreshold + hiveContext.conf.settings.synchronized { + val tmp = hiveContext.conf.autoBroadcastJoinThreshold sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key}=-1") df = sql(leftSemiJoinQuery) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala index 7ee1c8d13aa3f..3ab4576811194 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala @@ -18,18 +18,18 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.hive.test.TestHiveSingleton case class FunctionResult(f1: String, f2: String) -class UDFSuite extends QueryTest { - private lazy val ctx = org.apache.spark.sql.hive.test.TestHive +class UDFSuite extends QueryTest with TestHiveSingleton { test("UDF case insensitive") { - ctx.udf.register("random0", () => { Math.random() }) - ctx.udf.register("RANDOM1", () => { Math.random() }) - ctx.udf.register("strlenScala", (_: String).length + (_: Int)) - assert(ctx.sql("SELECT RANDOM0() FROM src LIMIT 1").head().getDouble(0) >= 0.0) - assert(ctx.sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0) >= 0.0) - assert(ctx.sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0) === 5) + hiveContext.udf.register("random0", () => { Math.random() }) + hiveContext.udf.register("RANDOM1", () => { Math.random() }) + hiveContext.udf.register("strlenScala", (_: String).length + (_: Int)) + assert(hiveContext.sql("SELECT RANDOM0() FROM src LIMIT 1").head().getDouble(0) >= 0.0) + assert(hiveContext.sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0) >= 0.0) + assert(hiveContext.sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0) === 5) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index 4886a85948367..24b1846923c77 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -17,19 +17,57 @@ package org.apache.spark.sql.hive.execution -import org.scalatest.BeforeAndAfterAll +import scala.collection.JavaConverters._ import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.execution.aggregate -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum} +import org.apache.spark.sql.hive.test.TestHiveSingleton -abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll { - override def _sqlContext: SQLContext = TestHive - protected val sqlContext = _sqlContext - import sqlContext.implicits._ +class ScalaAggregateFunction(schema: StructType) extends UserDefinedAggregateFunction { + + def inputSchema: StructType = 
schema + + def bufferSchema: StructType = schema + + def dataType: DataType = schema + + def deterministic: Boolean = true + + def initialize(buffer: MutableAggregationBuffer): Unit = { + (0 until schema.length).foreach { i => + buffer.update(i, null) + } + } + + def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + if (!input.isNullAt(0) && input.getInt(0) == 50) { + (0 until schema.length).foreach { i => + buffer.update(i, input.get(i)) + } + } + } + + def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + if (!buffer2.isNullAt(0) && buffer2.getInt(0) == 50) { + (0 until schema.length).foreach { i => + buffer1.update(i, buffer2.get(i)) + } + } + } + + def evaluate(buffer: Row): Any = { + Row.fromSeq(buffer.toSeq) + } +} + +abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + import testImplicits._ var originalUseAggregate2: Boolean = _ @@ -69,7 +107,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be data2.write.saveAsTable("agg2") val emptyDF = sqlContext.createDataFrame( - sqlContext.sparkContext.emptyRDD[Row], + sparkContext.emptyRDD[Row], StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil)) emptyDF.registerTempTable("emptyTable") @@ -511,40 +549,69 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Be }.getMessage assert(errorMessage.contains("implemented based on the new Aggregate Function interface")) } + } - // TODO: once we support Hive UDAF in the new interface, - // we can remove the following two tests. - withSQLConf("spark.sql.useAggregate2" -> "true") { - val errorMessage = intercept[AnalysisException] { - sqlContext.sql( - """ - |SELECT - | key, - | mydoublesum(value + 1.5 * key), - | stddev_samp(value) - |FROM agg1 - |GROUP BY key - """.stripMargin).collect() - }.getMessage - assert(errorMessage.contains("implemented based on the new Aggregate Function interface")) - - // This will fall back to the old aggregate - val newAggregateOperators = sqlContext.sql( - """ - |SELECT - | key, - | sum(value + 1.5 * key), - | stddev_samp(value) - |FROM agg1 - |GROUP BY key - """.stripMargin).queryExecution.executedPlan.collect { - case agg: aggregate.SortBasedAggregate => agg - case agg: aggregate.TungstenAggregate => agg + test("udaf with all data types") { + val struct = + StructType( + StructField("f1", FloatType, true) :: + StructField("f2", ArrayType(BooleanType), true) :: Nil) + val dataTypes = Seq(StringType, BinaryType, NullType, BooleanType, + ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), + DateType, TimestampType, + ArrayType(IntegerType), MapType(StringType, LongType), struct, + new MyDenseVectorUDT()) + // Right now, we will use SortBasedAggregate to handle UDAFs. + // UnsafeRow.mutableFieldTypes.asScala.toSeq will trigger SortBasedAggregate to use + // UnsafeRow as the aggregation buffer. While, dataTypes will trigger + // SortBasedAggregate to use a safe row as the aggregation buffer. + Seq(dataTypes, UnsafeRow.mutableFieldTypes.asScala.toSeq).foreach { dataTypes => + val fields = dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, nullable = true) } - val message = - "We should fallback to the old aggregation code path if " + - "there is any aggregate function that cannot be converted to the new interface." - assert(newAggregateOperators.isEmpty, message) + // The schema used for data generator. 
+ val schemaForGenerator = StructType(fields) + // The schema used for the DataFrame df. + val schema = StructType(StructField("id", IntegerType) +: fields) + + logInfo(s"Testing schema: ${schema.treeString}") + + val udaf = new ScalaAggregateFunction(schema) + // Generate data at the driver side. We need to materialize the data first and then + // create RDD. + val maybeDataGenerator = + RandomDataGenerator.forType( + dataType = schemaForGenerator, + nullable = true, + seed = Some(System.nanoTime())) + val dataGenerator = + maybeDataGenerator + .getOrElse(fail(s"Failed to create data generator for schema $schemaForGenerator")) + val data = (1 to 50).map { i => + dataGenerator.apply() match { + case row: Row => Row.fromSeq(i +: row.toSeq) + case null => Row.fromSeq(i +: Seq.fill(schemaForGenerator.length)(null)) + case other => + fail(s"Row or null is expected to be generated, " + + s"but a ${other.getClass.getCanonicalName} is generated.") + } + } + + // Create a DF for the schema with random data. + val rdd = sqlContext.sparkContext.parallelize(data, 1) + val df = sqlContext.createDataFrame(rdd, schema) + + val allColumns = df.schema.fields.map(f => col(f.name)) + val expectedAnaswer = + data + .find(r => r.getInt(0) == 50) + .getOrElse(fail("A row with id 50 should be the expected answer.")) + checkAnswer( + df.groupBy().agg(udaf(allColumns: _*)), + // udaf returns a Row as the output value. + Row(expectedAnaswer) + ) } } } @@ -597,7 +664,7 @@ class TungstenAggregationQueryWithControlledFallbackSuite extends AggregationQue sqlContext.conf.unsetConf("spark.sql.TungstenAggregate.testFallbackStartsAt") } - override protected def checkAnswer(actual: DataFrame, expectedAnswer: Seq[Row]): Unit = { + override protected def checkAnswer(actual: => DataFrame, expectedAnswer: Seq[Row]): Unit = { (0 to 2).foreach { fallbackStartsAt => sqlContext.setConf( "spark.sql.TungstenAggregate.testFallbackStartsAt", @@ -605,6 +672,7 @@ class TungstenAggregationQueryWithControlledFallbackSuite extends AggregationQue // Create a new df to make sure its physical operator picks up // spark.sql.TungstenAggregate.testFallbackStartsAt. + // todo: remove it? val newActual = DataFrame(sqlContext, actual.logicalPlan) QueryTest.checkAnswer(newActual, expectedAnswer) match { @@ -626,12 +694,12 @@ class TungstenAggregationQueryWithControlledFallbackSuite extends AggregationQue } // Override it to make sure we call the actually overridden checkAnswer. - override protected def checkAnswer(df: DataFrame, expectedAnswer: Row): Unit = { + override protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { checkAnswer(df, Seq(expectedAnswer)) } // Override it to make sure we call the actually overridden checkAnswer. 
- override protected def checkAnswer(df: DataFrame, expectedAnswer: DataFrame): Unit = { + override protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { checkAnswer(df, expectedAnswer.collect()) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ConcurrentHiveSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ConcurrentHiveSuite.scala index b0d3dd44daedc..e38d1eb5779fe 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ConcurrentHiveSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ConcurrentHiveSuite.scala @@ -25,8 +25,10 @@ class ConcurrentHiveSuite extends SparkFunSuite with BeforeAndAfterAll { ignore("multiple instances not supported") { test("Multiple Hive Instances") { (1 to 10).map { i => + val conf = new SparkConf() + conf.set("spark.ui.enabled", "false") val ts = - new TestHiveContext(new SparkContext("local", s"TestSQLContext$i", new SparkConf())) + new TestHiveContext(new SparkContext("local", s"TestSQLContext$i", conf)) ts.executeSql("SHOW TABLES").toRdd.collect() ts.executeSql("SELECT * FROM src").toRdd.collect() ts.executeSql("SHOW TABLES").toRdd.collect() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 4d45249d9c6b8..aa95ba94fa873 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -23,7 +23,7 @@ import scala.util.control.NonFatal import org.scalatest.{BeforeAndAfterAll, GivenWhenThen} -import org.apache.spark.{Logging, SparkFunSuite} +import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util._ @@ -42,7 +42,7 @@ import org.apache.spark.sql.hive.test.TestHive * configured using system properties. */ abstract class HiveComparisonTest - extends SparkFunSuite with BeforeAndAfterAll with GivenWhenThen with Logging { + extends SparkFunSuite with BeforeAndAfterAll with GivenWhenThen { /** * When set, any cache files that result in test failures will be deleted. Used when the test diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index 11d7a872dff09..94162da4eae1a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -17,17 +17,14 @@ package org.apache.spark.sql.hive.execution -import org.apache.spark.sql.{SQLContext, QueryTest} -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.hive.test.TestHiveSingleton /** * A set of tests that validates support for Hive Explain command. 
*/ -class HiveExplainSuite extends QueryTest with SQLTestUtils { - override def _sqlContext: SQLContext = TestHive - private val sqlContext = _sqlContext +class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("explain extended command") { checkExistence(sql(" explain select * from src where key=123 "), true, @@ -83,7 +80,7 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils { test("SPARK-6212: The EXPLAIN output of CTAS only shows the analyzed plan") { withTempTable("jt") { val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}""")) - read.json(rdd).registerTempTable("jt") + hiveContext.read.json(rdd).registerTempTable("jt") val outputs = sql( s""" |EXPLAIN EXTENDED diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveOperatorQueryableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveOperatorQueryableSuite.scala index efbef68cd4447..0d4c7f86b315a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveOperatorQueryableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveOperatorQueryableSuite.scala @@ -18,14 +18,16 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.{Row, QueryTest} -import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} /** * A set of tests that validates commands can also be queried by like a table */ -class HiveOperatorQueryableSuite extends QueryTest { +class HiveOperatorQueryableSuite extends QueryTest with TestHiveSingleton { + import hiveContext._ + test("SPARK-5324 query result of describe command") { - loadTestTable("src") + hiveContext.loadTestTable("src") // register a describe command to be a temp table sql("desc src").registerTempTable("mydesc") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HivePlanTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HivePlanTest.scala index ba56a8a6b689c..cd055f9eca37e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HivePlanTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HivePlanTest.scala @@ -21,11 +21,11 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.QueryTest import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.expressions.Window -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHiveSingleton -class HivePlanTest extends QueryTest { - import TestHive._ - import TestHive.implicits._ +class HivePlanTest extends QueryTest with TestHiveSingleton { + import hiveContext.sql + import hiveContext.implicits._ test("udf constant folding") { Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 83f9f3eaa3a5e..fe63ad5683195 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, Row} import org.apache.spark.sql.catalyst.expressions.Cast import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.test.TestHiveContext import 
org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ @@ -1104,18 +1105,19 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { // "SET" itself returns all config variables currently specified in SQLConf. // TODO: Should we be listing the default here always? probably... - assert(sql("SET").collect().size == 0) + assert(sql("SET").collect().size === TestHiveContext.overrideConfs.size) + val defaults = collectResults(sql("SET")) assertResult(Set(testKey -> testVal)) { collectResults(sql(s"SET $testKey=$testVal")) } - assert(hiveconf.get(testKey, "") == testVal) - assertResult(Set(testKey -> testVal))(collectResults(sql("SET"))) + assert(hiveconf.get(testKey, "") === testVal) + assertResult(defaults ++ Set(testKey -> testVal))(collectResults(sql("SET"))) sql(s"SET ${testKey + testKey}=${testVal + testVal}") assert(hiveconf.get(testKey + testKey, "") == testVal + testVal) - assertResult(Set(testKey -> testVal, (testKey + testKey) -> (testVal + testVal))) { + assertResult(defaults ++ Set(testKey -> testVal, (testKey + testKey) -> (testVal + testVal))) { collectResults(sql("SET")) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 9c10ffe1113dc..5f9a447759b48 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -21,14 +21,15 @@ import java.io.{DataInput, DataOutput} import java.util.{ArrayList, Arrays, Properties} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hive.ql.udf.generic.{GenericUDAFAverage, GenericUDF} +import org.apache.hadoop.hive.ql.udf.UDAFPercentile +import org.apache.hadoop.hive.ql.udf.generic.{GenericUDFOPAnd, GenericUDTFExplode, GenericUDAFAverage, GenericUDF} import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorFactory} import org.apache.hadoop.hive.serde2.{AbstractSerDe, SerDeStats} import org.apache.hadoop.io.Writable import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SQLConf} -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.util.Utils @@ -43,10 +44,10 @@ case class ListStringCaseClass(l: Seq[String]) /** * A test suite for Hive custom UDFs. 
*/ -class HiveUDFSuite extends QueryTest { +class HiveUDFSuite extends QueryTest with TestHiveSingleton { - import TestHive.{udf, sql} - import TestHive.implicits._ + import hiveContext.{udf, sql} + import hiveContext.implicits._ test("spark sql udf test that returns a struct") { udf.register("getStruct", (_: Int) => Fields(1, 2, 3, 4, 5)) @@ -123,21 +124,21 @@ class HiveUDFSuite extends QueryTest { | "value", value)).value FROM src """.stripMargin), Seq(Row("val_0"))) } - val codegenDefault = TestHive.getConf(SQLConf.CODEGEN_ENABLED) - TestHive.setConf(SQLConf.CODEGEN_ENABLED, true) + val codegenDefault = hiveContext.getConf(SQLConf.CODEGEN_ENABLED) + hiveContext.setConf(SQLConf.CODEGEN_ENABLED, true) testOrderInStruct() - TestHive.setConf(SQLConf.CODEGEN_ENABLED, false) + hiveContext.setConf(SQLConf.CODEGEN_ENABLED, false) testOrderInStruct() - TestHive.setConf(SQLConf.CODEGEN_ENABLED, codegenDefault) + hiveContext.setConf(SQLConf.CODEGEN_ENABLED, codegenDefault) } - test("SPARK-6409 UDAFAverage test") { + test("SPARK-6409 UDAF Average test") { sql(s"CREATE TEMPORARY FUNCTION test_avg AS '${classOf[GenericUDAFAverage].getName}'") checkAnswer( sql("SELECT test_avg(1), test_avg(substr(value,5)) FROM src"), Seq(Row(1.0, 260.182))) sql("DROP TEMPORARY FUNCTION IF EXISTS test_avg") - TestHive.reset() + hiveContext.reset() } test("SPARK-2693 udaf aggregates test") { @@ -157,7 +158,7 @@ class HiveUDFSuite extends QueryTest { } test("UDFIntegerToString") { - val testData = TestHive.sparkContext.parallelize( + val testData = hiveContext.sparkContext.parallelize( IntegerCaseClass(1) :: IntegerCaseClass(2) :: Nil).toDF() testData.registerTempTable("integerTable") @@ -168,11 +169,11 @@ class HiveUDFSuite extends QueryTest { Seq(Row("1"), Row("2"))) sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFIntegerToString") - TestHive.reset() + hiveContext.reset() } test("UDFToListString") { - val testData = TestHive.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() + val testData = hiveContext.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() testData.registerTempTable("inputTable") sql(s"CREATE TEMPORARY FUNCTION testUDFToListString AS '${classOf[UDFToListString].getName}'") @@ -183,11 +184,11 @@ class HiveUDFSuite extends QueryTest { "JVM type erasure makes spark fail to catch a component type in List<>;") sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToListString") - TestHive.reset() + hiveContext.reset() } test("UDFToListInt") { - val testData = TestHive.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() + val testData = hiveContext.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() testData.registerTempTable("inputTable") sql(s"CREATE TEMPORARY FUNCTION testUDFToListInt AS '${classOf[UDFToListInt].getName}'") @@ -198,11 +199,11 @@ class HiveUDFSuite extends QueryTest { "JVM type erasure makes spark fail to catch a component type in List<>;") sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToListInt") - TestHive.reset() + hiveContext.reset() } test("UDFToStringIntMap") { - val testData = TestHive.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() + val testData = hiveContext.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() testData.registerTempTable("inputTable") sql(s"CREATE TEMPORARY FUNCTION testUDFToStringIntMap " + @@ -214,11 +215,11 @@ class HiveUDFSuite extends QueryTest { "JVM type erasure makes spark fail to catch key and value types in Map<>;") sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToStringIntMap") - TestHive.reset() + 
hiveContext.reset() } test("UDFToIntIntMap") { - val testData = TestHive.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() + val testData = hiveContext.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() testData.registerTempTable("inputTable") sql(s"CREATE TEMPORARY FUNCTION testUDFToIntIntMap " + @@ -230,11 +231,11 @@ class HiveUDFSuite extends QueryTest { "JVM type erasure makes spark fail to catch key and value types in Map<>;") sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToIntIntMap") - TestHive.reset() + hiveContext.reset() } test("UDFListListInt") { - val testData = TestHive.sparkContext.parallelize( + val testData = hiveContext.sparkContext.parallelize( ListListIntCaseClass(Nil) :: ListListIntCaseClass(Seq((1, 2, 3))) :: ListListIntCaseClass(Seq((4, 5, 6), (7, 8, 9))) :: Nil).toDF() @@ -246,11 +247,11 @@ class HiveUDFSuite extends QueryTest { Seq(Row(0), Row(2), Row(13))) sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFListListInt") - TestHive.reset() + hiveContext.reset() } test("UDFListString") { - val testData = TestHive.sparkContext.parallelize( + val testData = hiveContext.sparkContext.parallelize( ListStringCaseClass(Seq("a", "b", "c")) :: ListStringCaseClass(Seq("d", "e")) :: Nil).toDF() testData.registerTempTable("listStringTable") @@ -261,11 +262,11 @@ class HiveUDFSuite extends QueryTest { Seq(Row("a,b,c"), Row("d,e"))) sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFListString") - TestHive.reset() + hiveContext.reset() } test("UDFStringString") { - val testData = TestHive.sparkContext.parallelize( + val testData = hiveContext.sparkContext.parallelize( StringCaseClass("world") :: StringCaseClass("goodbye") :: Nil).toDF() testData.registerTempTable("stringTable") @@ -280,11 +281,11 @@ class HiveUDFSuite extends QueryTest { sql("DROP TEMPORARY FUNCTION IF EXISTS testStringStringUDF") - TestHive.reset() + hiveContext.reset() } test("UDFTwoListList") { - val testData = TestHive.sparkContext.parallelize( + val testData = hiveContext.sparkContext.parallelize( ListListIntCaseClass(Nil) :: ListListIntCaseClass(Seq((1, 2, 3))) :: ListListIntCaseClass(Seq((4, 5, 6), (7, 8, 9))) :: @@ -297,7 +298,63 @@ class HiveUDFSuite extends QueryTest { Seq(Row("0, 0"), Row("2, 2"), Row("13, 13"))) sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFTwoListList") - TestHive.reset() + hiveContext.reset() + } + + test("Hive UDFs with insufficient number of input arguments should trigger an analysis error") { + Seq((1, 2)).toDF("a", "b").registerTempTable("testUDF") + + { + // HiveSimpleUDF + sql(s"CREATE TEMPORARY FUNCTION testUDFTwoListList AS '${classOf[UDFTwoListList].getName}'") + val message = intercept[AnalysisException] { + sql("SELECT testUDFTwoListList() FROM testUDF") + }.getMessage + assert(message.contains("No handler for Hive udf")) + sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFTwoListList") + } + + { + // HiveGenericUDF + sql(s"CREATE TEMPORARY FUNCTION testUDFAnd AS '${classOf[GenericUDFOPAnd].getName}'") + val message = intercept[AnalysisException] { + sql("SELECT testUDFAnd() FROM testUDF") + }.getMessage + assert(message.contains("No handler for Hive udf")) + sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFAnd") + } + + { + // Hive UDAF + sql(s"CREATE TEMPORARY FUNCTION testUDAFPercentile AS '${classOf[UDAFPercentile].getName}'") + val message = intercept[AnalysisException] { + sql("SELECT testUDAFPercentile(a) FROM testUDF GROUP BY b") + }.getMessage + assert(message.contains("No handler for Hive udf")) + sql("DROP TEMPORARY FUNCTION IF EXISTS testUDAFPercentile") + } 
+ + { + // AbstractGenericUDAFResolver + sql(s"CREATE TEMPORARY FUNCTION testUDAFAverage AS '${classOf[GenericUDAFAverage].getName}'") + val message = intercept[AnalysisException] { + sql("SELECT testUDAFAverage() FROM testUDF GROUP BY b") + }.getMessage + assert(message.contains("No handler for Hive udf")) + sql("DROP TEMPORARY FUNCTION IF EXISTS testUDAFAverage") + } + + { + // Hive UDTF + sql(s"CREATE TEMPORARY FUNCTION testUDTFExplode AS '${classOf[GenericUDTFExplode].getName}'") + val message = intercept[AnalysisException] { + sql("SELECT testUDTFExplode() FROM testUDF") + }.getMessage + assert(message.contains("No handler for Hive udf")) + sql("DROP TEMPORARY FUNCTION IF EXISTS testUDTFExplode") + } + + sqlContext.dropTempTable("testUDF") } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 1ff1d9a2934cc..8c3f9ac202637 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -26,9 +26,7 @@ import org.apache.spark.sql.catalyst.DefaultParserDialect import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, EliminateSubQueries} import org.apache.spark.sql.catalyst.errors.DialectException import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.hive.{HiveContext, HiveQLDialect, MetastoreRelation} import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.test.SQLTestUtils @@ -65,12 +63,12 @@ class MyDialect extends DefaultParserDialect * Hive to generate them (in contrast to HiveQuerySuite). Often this is because the query is * valid, but Hive currently cannot execute it. 
*/ -class SQLQuerySuite extends QueryTest with SQLTestUtils { - override def _sqlContext: SQLContext = TestHive - private val sqlContext = _sqlContext +class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + import hiveContext._ + import hiveContext.implicits._ test("UDTF") { - sql(s"ADD JAR ${TestHive.getHiveFile("TestUDTF.jar").getCanonicalPath()}") + sql(s"ADD JAR ${hiveContext.getHiveFile("TestUDTF.jar").getCanonicalPath()}") // The function source code can be found at: // https://cwiki.apache.org/confluence/display/Hive/DeveloperGuide+UDTF sql( @@ -265,7 +263,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { def checkRelation(tableName: String, isDataSourceParquet: Boolean): Unit = { val relation = EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) relation match { - case LogicalRelation(r: ParquetRelation) => + case LogicalRelation(r: ParquetRelation, _) => if (!isDataSourceParquet) { fail( s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " + @@ -509,19 +507,19 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { checkAnswer( sql("SELECT f1.f2.f3 FROM nested"), Row(1)) - checkAnswer(sql("CREATE TABLE test_ctas_1234 AS SELECT * from nested"), - Seq.empty[Row]) + + sql("CREATE TABLE test_ctas_1234 AS SELECT * from nested") checkAnswer( sql("SELECT * FROM test_ctas_1234"), sql("SELECT * FROM nested").collect().toSeq) intercept[AnalysisException] { - sql("CREATE TABLE test_ctas_12345 AS SELECT * from notexists").collect() + sql("CREATE TABLE test_ctas_1234 AS SELECT * from notexists").collect() } } test("test CTAS") { - checkAnswer(sql("CREATE TABLE test_ctas_123 AS SELECT key, value FROM src"), Seq.empty[Row]) + sql("CREATE TABLE test_ctas_123 AS SELECT key, value FROM src") checkAnswer( sql("SELECT key, value FROM test_ctas_123 ORDER BY key"), sql("SELECT key, value FROM src ORDER BY key").collect().toSeq) @@ -614,7 +612,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { val rowRdd = sparkContext.parallelize(row :: Nil) - TestHive.createDataFrame(rowRdd, schema).registerTempTable("testTable") + hiveContext.createDataFrame(rowRdd, schema).registerTempTable("testTable") sql( """CREATE TABLE nullValuesInInnerComplexTypes @@ -1044,10 +1042,10 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { val thread = new Thread { override def run() { // To make sure this test works, this jar should not be loaded in another place. 
- TestHive.sql( - s"ADD JAR ${TestHive.getHiveFile("hive-contrib-0.13.1.jar").getCanonicalPath()}") + sql( + s"ADD JAR ${hiveContext.getHiveFile("hive-contrib-0.13.1.jar").getCanonicalPath()}") try { - TestHive.sql( + sql( """ |CREATE TEMPORARY FUNCTION example_max |AS 'org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleMax' @@ -1097,21 +1095,21 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { test("SPARK-8588 HiveTypeCoercion.inConversion fires too early") { val df = - TestHive.createDataFrame(Seq((1, "2014-01-01"), (2, "2015-01-01"), (3, "2016-01-01"))) + createDataFrame(Seq((1, "2014-01-01"), (2, "2015-01-01"), (3, "2016-01-01"))) df.toDF("id", "datef").registerTempTable("test_SPARK8588") checkAnswer( - TestHive.sql( + sql( """ |select id, concat(year(datef)) |from test_SPARK8588 where concat(year(datef), ' year') in ('2015 year', '2014 year') """.stripMargin), Row(1, "2014") :: Row(2, "2015") :: Nil ) - TestHive.dropTempTable("test_SPARK8588") + dropTempTable("test_SPARK8588") } test("SPARK-9371: fix the support for special chars in column names for hive context") { - TestHive.read.json(TestHive.sparkContext.makeRDD( + read.json(sparkContext.makeRDD( """{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""" :: Nil)) .registerTempTable("t") @@ -1142,8 +1140,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { test("specifying database name for a temporary table is not allowed") { withTempPath { dir => val path = dir.getCanonicalPath - val df = - sqlContext.sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") + val df = sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") df .write .format("parquet") @@ -1173,4 +1170,82 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils { checkAnswer(sqlContext.table("`db.t`"), df) } } + + test("SPARK-10593 same column names in lateral view") { + val df = sqlContext.sql( + """ + |select + |insideLayer2.json as a2 + |from (select '{"layer1": {"layer2": "text inside layer 2"}}' json) test + |lateral view json_tuple(json, 'layer1') insideLayer1 as json + |lateral view json_tuple(insideLayer1.json, 'layer2') insideLayer2 as json + """.stripMargin + ) + + checkAnswer(df, Row("text inside layer 2") :: Nil) + } + + test("SPARK-10310: " + + "script transformation using default input/output SerDe and record reader/writer") { + sqlContext + .range(5) + .selectExpr("id AS a", "id AS b") + .registerTempTable("test") + + checkAnswer( + sql( + """FROM( + | FROM test SELECT TRANSFORM(a, b) + | USING 'python src/test/resources/data/scripts/test_transform.py "\t"' + | AS (c STRING, d STRING) + |) t + |SELECT c + """.stripMargin), + (0 until 5).map(i => Row(i + "#"))) + } + + test("SPARK-10310: script transformation using LazySimpleSerDe") { + sqlContext + .range(5) + .selectExpr("id AS a", "id AS b") + .registerTempTable("test") + + val df = sql( + """FROM test + |SELECT TRANSFORM(a, b) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES('field.delim' = '|') + |USING 'python src/test/resources/data/scripts/test_transform.py "|"' + |AS (c STRING, d STRING) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES('field.delim' = '|') + """.stripMargin) + + checkAnswer(df, (0 until 5).map(i => Row(i + "#", i + "#"))) + } + + test("SPARK-10741: Sort on Aggregate using parquet") { + withTable("test10741") { + withTempTable("src") { + Seq("a" -> 5, "a" -> 9, "b" -> 
6).toDF().registerTempTable("src") + sql("CREATE TABLE test10741(c1 STRING, c2 INT) STORED AS PARQUET AS SELECT * FROM src") + } + + checkAnswer(sql( + """ + |SELECT c1, AVG(c2) AS c_avg + |FROM test10741 + |GROUP BY c1 + |HAVING (AVG(c2) > 5) ORDER BY c1 + """.stripMargin), Row("a", 7.0) :: Row("b", 6.0) :: Nil) + + checkAnswer(sql( + """ + |SELECT c1, AVG(c2) AS c_avg + |FROM test10741 + |GROUP BY c1 + |ORDER BY AVG(c2) + """.stripMargin), Row("b", 6.0) :: Row("a", 7.0) :: Nil) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala index 9aca40f15ac15..7cfdb886b585d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala @@ -22,17 +22,14 @@ import org.scalatest.exceptions.TestFailedException import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.{UnaryNode, SparkPlan, SparkPlanTest} -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.types.StringType -class ScriptTransformationSuite extends SparkPlanTest { - - override def _sqlContext: SQLContext = TestHive - private val sqlContext = _sqlContext +class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton { + import hiveContext.implicits._ private val noSerdeIOSchema = HiveScriptIOSchema( inputRowFormat = Seq.empty, @@ -41,6 +38,8 @@ class ScriptTransformationSuite extends SparkPlanTest { outputSerdeClass = None, inputSerdeProps = Seq.empty, outputSerdeProps = Seq.empty, + recordReaderClass = None, + recordWriterClass = None, schemaLess = false ) @@ -59,7 +58,7 @@ class ScriptTransformationSuite extends SparkPlanTest { output = Seq(AttributeReference("a", StringType)()), child = child, ioschema = noSerdeIOSchema - )(TestHive), + )(hiveContext), rowsDf.collect()) } @@ -73,7 +72,7 @@ class ScriptTransformationSuite extends SparkPlanTest { output = Seq(AttributeReference("a", StringType)()), child = child, ioschema = serdeIOSchema - )(TestHive), + )(hiveContext), rowsDf.collect()) } @@ -88,7 +87,7 @@ class ScriptTransformationSuite extends SparkPlanTest { output = Seq(AttributeReference("a", StringType)()), child = ExceptionInjectingOperator(child), ioschema = noSerdeIOSchema - )(TestHive), + )(hiveContext), rowsDf.collect()) } assert(e.getMessage().contains("intentional exception")) @@ -105,7 +104,7 @@ class ScriptTransformationSuite extends SparkPlanTest { output = Seq(AttributeReference("a", StringType)()), child = ExceptionInjectingOperator(child), ioschema = serdeIOSchema - )(TestHive), + )(hiveContext), rowsDf.collect()) } assert(e.getMessage().contains("intentional exception")) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala index deec0048d24b8..92043d66c914f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala @@ -24,10 +24,17 @@ import 
org.apache.spark.sql.sources.HadoopFsRelationTest import org.apache.spark.sql.types._ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { + import testImplicits._ + override val dataSourceName: String = classOf[DefaultSource].getCanonicalName - import sqlContext._ - import sqlContext.implicits._ + // ORC does not play well with NullType and UDT. + override protected def supportsDataType(dataType: DataType): Boolean = dataType match { + case _: NullType => false + case _: CalendarIntervalType => false + case _: UserDefinedType[_] => false + case _ => true + } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => @@ -48,7 +55,7 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( - read.options(Map( + hiveContext.read.options(Map( "path" -> file.getCanonicalPath, "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName).load()) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala index a46ca9a2c9706..52e09f9496f05 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala @@ -18,19 +18,17 @@ package org.apache.spark.sql.hive.orc import java.io.File -import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ -import org.apache.spark.util.Utils -import org.scalatest.BeforeAndAfterAll import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.scalatest.BeforeAndAfterAll +import org.apache.hadoop.hive.conf.HiveConf.ConfVars + +import org.apache.spark.sql._ +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.util.Utils + // The data where the partitioning key exists only in the directory structure. 
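The `supportsDataType` override added above is the hook the shared relation test uses to skip types a given format cannot store. A small sketch of that filter, assuming only the spark-sql types module on the classpath (the object name and candidate list are illustrative; the real ORC override also rejects user-defined types):

    import org.apache.spark.sql.types._

    object SupportsDataTypeSketch extends App {
      // ORC-style rules from the override above: reject null and interval data.
      def supportsDataType(dataType: DataType): Boolean = dataType match {
        case _: NullType             => false
        case _: CalendarIntervalType => false
        case _                       => true
      }

      // A base suite would enumerate candidate types and only generate
      // round-trip tests for the ones the concrete format claims to support.
      val candidates: Seq[DataType] =
        Seq(StringType, IntegerType, NullType, CalendarIntervalType, ArrayType(LongType))

      candidates.filter(supportsDataType).foreach(t => println(s"would generate a test for $t"))
    }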
case class OrcParData(intField: Int, stringField: String) @@ -38,7 +36,10 @@ case class OrcParData(intField: Int, stringField: String) case class OrcParDataWithKey(intField: Int, pi: Int, stringField: String, ps: String) // TODO This test suite duplicates ParquetPartitionDiscoverySuite a lot -class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll { +class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll { + import hiveContext._ + import hiveContext.implicits._ + val defaultPartitionName = ConfVars.DEFAULTPARTITIONNAME.defaultStrVal def withTempDir(f: File => Unit): Unit = { @@ -58,7 +59,7 @@ class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll { } protected def withTempTable(tableName: String)(f: => Unit): Unit = { - try f finally TestHive.dropTempTable(tableName) + try f finally hiveContext.dropTempTable(tableName) } protected def makePartitionDir( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index 744d462938141..5eb39b1129701 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -287,6 +287,20 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } + test("SPARK-9170: Don't implicitly lowercase of user-provided columns") { + withTempPath { dir => + val path = dir.getCanonicalPath + + sqlContext.range(0, 10).select('id as "Acol").write.format("orc").save(path) + sqlContext.read.format("orc").load(path).schema("Acol") + intercept[IllegalArgumentException] { + sqlContext.read.format("orc").load(path).schema("acol") + } + checkAnswer(sqlContext.read.format("orc").load(path).select("acol").sort("acol"), + (0 until 10).map(Row(_))) + } + } + test("SPARK-8501: Avoids discovery schema from empty ORC files") { withTempPath { dir => val path = dir.getCanonicalPath @@ -330,4 +344,34 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } } + + test("SPARK-10623 Enable ORC PPD") { + withTempPath { dir => + withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { + import testImplicits._ + + val path = dir.getCanonicalPath + sqlContext.range(10).coalesce(1).write.orc(path) + val df = sqlContext.read.orc(path) + + def checkPredicate(pred: Column, answer: Seq[Long]): Unit = { + checkAnswer(df.where(pred), answer.map(Row(_))) + } + + checkPredicate('id === 5, Seq(5L)) + checkPredicate('id <=> 5, Seq(5L)) + checkPredicate('id < 5, 0L to 4L) + checkPredicate('id <= 5, 0L to 5L) + checkPredicate('id > 5, 6L to 9L) + checkPredicate('id >= 5, 5L to 9L) + checkPredicate('id.isNull, Seq.empty[Long]) + checkPredicate('id.isNotNull, 0L to 9L) + checkPredicate('id.isin(1L, 3L, 5L), Seq(1L, 3L, 5L)) + checkPredicate('id > 0 && 'id < 3, 1L to 2L) + checkPredicate('id < 1 || 'id > 8, Seq(0L, 9L)) + checkPredicate(!('id > 3), 0L to 3L) + checkPredicate(!('id > 0 && 'id < 3), Seq(0L) ++ (3L to 9L)) + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index 80c38084f293d..7a34cf731b4c5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -21,12 +21,14 @@ import java.io.File import 
org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.hive.test.TestHiveSingleton case class OrcData(intField: Int, stringField: String) -abstract class OrcSuite extends QueryTest with BeforeAndAfterAll { +abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll { + import hiveContext._ + var orcTableDir: File = null var orcTableAsDir: File = null @@ -156,7 +158,7 @@ class OrcSourceSuite extends OrcSuite { override def beforeAll(): Unit = { super.beforeAll() - sql( + hiveContext.sql( s"""CREATE TEMPORARY TABLE normal_orc_source |USING org.apache.spark.sql.hive.orc |OPTIONS ( @@ -164,7 +166,7 @@ class OrcSourceSuite extends OrcSuite { |) """.stripMargin) - sql( + hiveContext.sql( s"""CREATE TEMPORARY TABLE normal_orc_as_source |USING org.apache.spark.sql.hive.orc |OPTIONS ( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala index f7ba20ff41d8d..88a0ed511749f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala @@ -22,15 +22,12 @@ import java.io.File import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag -import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.hive.test.TestHiveSingleton -private[sql] trait OrcTest extends SQLTestUtils { this: SparkFunSuite => - protected override def _sqlContext: SQLContext = org.apache.spark.sql.hive.test.TestHive - protected val sqlContext = _sqlContext - import sqlContext.implicits._ - import sqlContext.sparkContext +private[sql] trait OrcTest extends SQLTestUtils with TestHiveSingleton { + import testImplicits._ /** * Writes `data` to a Orc file, which is then passed to `f` and will be deleted after `f` diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index 34d3434569f58..7d8104f9358d5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -19,15 +19,11 @@ package org.apache.spark.sql.hive import java.io.File -import org.scalatest.BeforeAndAfterAll - import org.apache.spark.sql._ import org.apache.spark.sql.execution.datasources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD} import org.apache.spark.sql.hive.execution.HiveTableScan -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ @@ -58,6 +54,8 @@ case class ParquetDataWithKeyAndComplexTypes( * built in parquet support. 
*/ class ParquetMetastoreSuite extends ParquetPartitioningTest { + import hiveContext._ + override def beforeAll(): Unit = { super.beforeAll() dropTables("partitioned_parquet", @@ -284,7 +282,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { ) table("test_parquet_ctas").queryExecution.optimizedPlan match { - case LogicalRelation(_: ParquetRelation) => // OK + case LogicalRelation(_: ParquetRelation, _) => // OK case _ => fail( "test_parquet_ctas should be converted to " + s"${classOf[ParquetRelation].getCanonicalName }") @@ -371,7 +369,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { assertResult(2) { analyzed.collect { - case r@LogicalRelation(_: ParquetRelation) => r + case r @ LogicalRelation(_: ParquetRelation, _) => r }.size } } @@ -380,7 +378,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { def collectParquetRelation(df: DataFrame): ParquetRelation = { val plan = df.queryExecution.analyzed plan.collectFirst { - case LogicalRelation(r: ParquetRelation) => r + case LogicalRelation(r: ParquetRelation, _) => r }.getOrElse { fail(s"Expecting a ParquetRelation2, but got:\n$plan") } @@ -430,7 +428,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { // Converted test_parquet should be cached. catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) match { case null => fail("Converted test_parquet should be cached in the cache.") - case logical @ LogicalRelation(parquetRelation: ParquetRelation) => // OK + case logical @ LogicalRelation(parquetRelation: ParquetRelation, _) => // OK case other => fail( "The cached test_parquet should be a Parquet Relation. " + @@ -536,6 +534,9 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { * A suite of tests for the Parquet support through the data sources API. */ class ParquetSourceSuite extends ParquetPartitioningTest { + import testImplicits._ + import hiveContext._ + override def beforeAll(): Unit = { super.beforeAll() dropTables("partitioned_parquet", @@ -684,9 +685,8 @@ class ParquetSourceSuite extends ParquetPartitioningTest { /** * A collection of tests for parquet data with various forms of partitioning. 
*/ -abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with BeforeAndAfterAll { - override def _sqlContext: SQLContext = TestHive - protected val sqlContext = _sqlContext +abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with TestHiveSingleton { + import testImplicits._ var partitionedTableDir: File = null var normalTableDir: File = null diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala index b4640b1616281..dc0531a6d4bc5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala @@ -18,16 +18,13 @@ package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils -class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils { - override def _sqlContext: SQLContext = TestHive - private val sqlContext = _sqlContext +class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index 8ca3a17085194..ef37787137d07 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -28,7 +28,13 @@ import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" - import sqlContext._ + // JSON does not write data of NullType and does not play well with BinaryType. + override protected def supportsDataType(dataType: DataType): Boolean = dataType match { + case _: NullType => false + case _: BinaryType => false + case _: CalendarIntervalType => false + case _ => true + } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => @@ -47,7 +53,7 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( - read.format(dataSourceName) + hiveContext.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } @@ -65,14 +71,14 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil - val df = createDataFrame(sparkContext.parallelize(data), schema) + val df = hiveContext.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. 
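Many hunks in the parquet and relation suites above only add a trailing `, _` to `LogicalRelation` matches; that is the mechanical fallout of the node gaining a second constructor field. The same situation in plain Scala (Wrapper and ParquetLike are hypothetical stand-ins, not the Spark classes):

    sealed trait Relation
    case class ParquetLike(path: String) extends Relation

    // Once the wrapper grows a second field, every existing single-argument
    // constructor pattern stops compiling and needs a wildcard for the new slot.
    case class Wrapper(relation: Relation, expectedOutput: Option[Seq[String]] = None)

    object PatternWildcardSketch extends App {
      val plan: Wrapper = Wrapper(ParquetLike("/tmp/data"))

      plan match {
        case Wrapper(r: ParquetLike, _) => println(s"parquet relation at ${r.path}") // was Wrapper(r: ParquetLike)
        case _                          => println("something else")
      }
    }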
checkAnswer( - read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), + hiveContext.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } @@ -90,14 +96,14 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil - val df = createDataFrame(sparkContext.parallelize(data), schema) + val df = hiveContext.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( - read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), + hiveContext.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala index cb4cedddbfddd..e2d754e806403 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala @@ -23,15 +23,21 @@ import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.{AnalysisException, SaveMode} -import org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.{execution, AnalysisException, SaveMode} +import org.apache.spark.sql.types._ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { + import testImplicits._ + override val dataSourceName: String = "parquet" - import sqlContext._ - import sqlContext.implicits._ + // Parquet does not play well with NullType. + override protected def supportsDataType(dataType: DataType): Boolean = dataType match { + case _: NullType => false + case _: CalendarIntervalType => false + case _ => true + } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => @@ -51,7 +57,7 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( - read.format(dataSourceName) + hiveContext.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } @@ -69,7 +75,7 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { .format("parquet") .save(s"${dir.getCanonicalPath}/_temporary") - checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df.collect()) + checkAnswer(hiveContext.read.format("parquet").load(dir.getCanonicalPath), df.collect()) } } @@ -97,7 +103,7 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { // This shouldn't throw anything. df.write.format("parquet").mode(SaveMode.Overwrite).save(path) - checkAnswer(read.format("parquet").load(path), df) + checkAnswer(hiveContext.read.format("parquet").load(path), df) } } @@ -107,7 +113,7 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { // Parquet doesn't allow field names with spaces. Here we are intentionally making an // exception thrown from the `ParquetRelation2.prepareForWriteJob()` method to trigger // the bug. Please refer to spark-8079 for more details. 
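These suites hand the expected schema to the data source as a JSON string option (`"dataSchema" -> schema.json`). A sketch of that round trip, assuming only spark-sql on the classpath (the object name is illustrative; the schema mirrors the one built in the tests above):

    import org.apache.spark.sql.types._

    object SchemaJsonSketch extends App {
      // Data schema plus the partition column, built the same way the tests do.
      val dataSchema = new StructType().add("a", IntegerType).add("b", StringType)
      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      // The suites pass this string via .option("dataSchema", ...).
      val asJson: String = dataSchemaWithPartition.json

      // A relation implementation can rebuild the identical StructType from it.
      val parsed = DataType.fromJson(asJson).asInstanceOf[StructType]
      assert(parsed == dataSchemaWithPartition)
      println(parsed.treeString)
    }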
- range(1, 10) + hiveContext.range(1, 10) .withColumnRenamed("id", "a b") .write .format("parquet") @@ -125,7 +131,7 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { val summaryPath = new Path(path, "_metadata") val commonSummaryPath = new Path(path, "_common_metadata") - val fs = summaryPath.getFileSystem(configuration) + val fs = summaryPath.getFileSystem(hadoopConfiguration) fs.delete(summaryPath, true) fs.delete(commonSummaryPath, true) @@ -136,4 +142,17 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { assert(fs.exists(commonSummaryPath)) } } + + test("SPARK-10334 Projections and filters should be kept in physical plan") { + withTempPath { dir => + val path = dir.getCanonicalPath + + sqlContext.range(2).select('id as 'a, 'id as 'b).write.partitionBy("b").parquet(path) + val df = sqlContext.read.parquet(path).filter('a === 0).select('b) + val physicalPlan = df.queryExecution.executedPlan + + assert(physicalPlan.collect { case p: execution.Project => p }.length === 1) + assert(physicalPlan.collect { case p: execution.Filter => p }.length === 1) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala index e8975e5f5cd08..a3a124488d983 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala @@ -20,12 +20,27 @@ package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName - import sqlContext._ + // We have a very limited number of supported types at here since it is just for a + // test relation and we do very basic testing at here. + override protected def supportsDataType(dataType: DataType): Boolean = dataType match { + case _: BinaryType => false + // We are using random data generator and the generated strings are not really valid string. 
+ case _: StringType => false + case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 + case _: CalendarIntervalType => false + case _: DateType => false + case _: TimestampType => false + case _: ArrayType => false + case _: MapType => false + case _: StructType => false + case _: UserDefinedType[_] => false + case _ => true + } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => @@ -44,7 +59,7 @@ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest { StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( - read.format(dataSourceName) + hiveContext.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index e8141923a9b5c..aeaaa3e1c5220 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -27,6 +27,7 @@ import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputForma import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} import org.apache.spark.rdd.RDD +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} import org.apache.spark.sql.types.{DataType, StructType} @@ -53,8 +54,10 @@ class AppendingTextOutputFormat(outputFile: Path) extends TextOutputFormat[NullW numberFormat.setGroupingUsed(false) override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { - val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") - val split = context.getTaskAttemptID.getTaskID.getId + val configuration = SparkHadoopUtil.get.getConfigurationFromJobContext(context) + val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") + val taskAttemptId = SparkHadoopUtil.get.getTaskAttemptIDFromTaskAttemptContext(context) + val split = taskAttemptId.getTaskID.getId val name = FileOutputFormat.getOutputName(context) new Path(outputFile, s"$name-${numberFormat.format(split)}-$uniqueWriteJobId") } @@ -65,7 +68,9 @@ class SimpleTextOutputWriter(path: String, context: TaskAttemptContext) extends new AppendingTextOutputFormat(new Path(path)).getRecordWriter(context) override def write(row: Row): Unit = { - val serialized = row.toSeq.map(_.toString).mkString(",") + val serialized = row.toSeq.map { v => + if (v == null) "" else v.toString + }.mkString(",") recordWriter.write(null, new Text(serialized)) } @@ -109,7 +114,8 @@ class SimpleTextRelation( val fields = dataSchema.map(_.dataType) sparkContext.textFile(inputStatuses.map(_.getPath).mkString(",")).map { record => - Row(record.split(",").zip(fields).map { case (value, dataType) => + Row(record.split(",", -1).zip(fields).map { case (v, dataType) => + val value = if (v == "") null else v // `Cast`ed values are always of Catalyst types (i.e. UTF8String instead of String, etc.) 
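The SimpleTextRelation change above encodes null as an empty field on write and maps it back on read, and switches to `split(",", -1)` so trailing empty fields survive. A standalone sketch of just that convention, in plain Scala (writeRow and readRow are illustrative helper names):

    object NullCsvSketch extends App {
      // Writing: nulls become empty fields, mirroring SimpleTextOutputWriter.
      def writeRow(row: Seq[Any]): String =
        row.map(v => if (v == null) "" else v.toString).mkString(",")

      // Reading: split with limit -1 keeps trailing empty fields, then empty
      // strings are mapped back to null, mirroring SimpleTextRelation.
      def readRow(record: String): Seq[Any] =
        record.split(",", -1).map(v => if (v == "") null else v).toSeq

      val original: Seq[Any] = Seq("a", null, "c", null)
      val line = writeRow(original)                // "a,,c,"
      println(line)
      println(readRow(line))                       // List(a, null, c, null)

      // Without the -1 limit the trailing null column would silently disappear:
      println("a,,c,".split(",").length)           // 3
      println("a,,c,".split(",", -1).length)       // 4
    }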
val catalystValue = Cast(Literal(value), dataType).eval() // Here we're converting Catalyst values to Scala values to test `needsConversion` diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala index 7966b43596e75..42b9b3d6340d8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala @@ -28,18 +28,18 @@ import org.apache.parquet.hadoop.ParquetOutputCommitter import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql._ import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ -abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { - override def _sqlContext: SQLContext = TestHive - protected val sqlContext = _sqlContext +abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with TestHiveSingleton { import sqlContext.implicits._ val dataSourceName: String + protected def supportsDataType(dataType: DataType): Boolean = true + val dataSchema = StructType( Seq( @@ -100,6 +100,60 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } } + private val supportedDataTypes = Seq( + StringType, BinaryType, + NullType, BooleanType, + ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), + DateType, TimestampType, + ArrayType(IntegerType), + MapType(StringType, LongType), + new StructType() + .add("f1", FloatType, nullable = true) + .add("f2", ArrayType(BooleanType, containsNull = true), nullable = true), + new MyDenseVectorUDT() + ).filter(supportsDataType) + + for (dataType <- supportedDataTypes) { + test(s"test all data types - $dataType") { + withTempPath { file => + val path = file.getCanonicalPath + + val dataGenerator = RandomDataGenerator.forType( + dataType = dataType, + nullable = true, + seed = Some(System.nanoTime()) + ).getOrElse { + fail(s"Failed to create data generator for schema $dataType") + } + + // Create a DF for the schema with random data. The index field is used to sort the + // DataFrame. This is a workaround for SPARK-10591. 
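The generated per-type tests above add an index column purely so the round-tripped rows can be compared positionally no matter what order the source returns them in (the SPARK-10591 workaround mentioned in the comment). The idea, stripped down to plain Scala:

    object IndexedRoundTripSketch extends App {
      // Attach an index before the round trip, sort both sides by it, and
      // compare values positionally so the check is order-independent.
      val written: Seq[(Int, String)] = (1 to 5).map(i => i -> s"value-$i")

      // Pretend the data source hands rows back in an arbitrary order.
      val readBack: Seq[(Int, String)] = scala.util.Random.shuffle(written)

      val expected = written.sortBy(_._1).map(_._2)
      val actual = readBack.sortBy(_._1).map(_._2)
      assert(expected == actual)
      println("round trip preserved every value")
    }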
+ val schema = new StructType() + .add("index", IntegerType, nullable = false) + .add("col", dataType, nullable = true) + val rdd = sqlContext.sparkContext.parallelize((1 to 10).map(i => Row(i, dataGenerator()))) + val df = sqlContext.createDataFrame(rdd, schema).orderBy("index").coalesce(1) + + df.write + .mode("overwrite") + .format(dataSourceName) + .option("dataSchema", df.schema.json) + .save(path) + + val loadedDF = sqlContext + .read + .format(dataSourceName) + .option("dataSchema", df.schema.json) + .schema(df.schema) + .load(path) + .orderBy("index") + + checkAnswer(loadedDF, df) + } + } + } + test("save()/load() - non-partitioned table - Overwrite") { withTempPath { file => testDF.write.mode(SaveMode.Overwrite).format(dataSourceName).save(file.getCanonicalPath) @@ -298,6 +352,19 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } } + test("saveAsTable()/load() - partitioned table - boolean type") { + sqlContext.range(2) + .select('id, ('id % 2 === 0).as("b")) + .write.partitionBy("b").saveAsTable("t") + + withTable("t") { + checkAnswer( + sqlContext.table("t").sort('id), + Row(0, true) :: Row(1, false) :: Nil + ) + } + } + test("saveAsTable()/load() - partitioned table - Overwrite") { partitionedTestDF.write .format(dataSourceName) @@ -432,7 +499,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } val actualPaths = df.queryExecution.analyzed.collectFirst { - case LogicalRelation(relation: HadoopFsRelation) => + case LogicalRelation(relation: HadoopFsRelation, _) => relation.paths.toSet }.getOrElse { fail("Expect an FSBasedRelation, but none could be found") @@ -504,17 +571,17 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } test("SPARK-8578 specified custom output committer will not be used to append data") { - val clonedConf = new Configuration(configuration) + val clonedConf = new Configuration(hadoopConfiguration) try { val df = sqlContext.range(1, 10).toDF("i") withTempPath { dir => df.write.mode("append").format(dataSourceName).save(dir.getCanonicalPath) - configuration.set( + hadoopConfiguration.set( SQLConf.OUTPUT_COMMITTER_CLASS.key, classOf[AlwaysFailOutputCommitter].getName) // Since Parquet has its own output committer setting, also set it // to AlwaysFailParquetOutputCommitter at here. - configuration.set("spark.sql.parquet.output.committer.class", + hadoopConfiguration.set("spark.sql.parquet.output.committer.class", classOf[AlwaysFailParquetOutputCommitter].getName) // Because there data already exists, // this append should succeed because we will use the output committer associated @@ -533,12 +600,12 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } } withTempPath { dir => - configuration.set( + hadoopConfiguration.set( SQLConf.OUTPUT_COMMITTER_CLASS.key, classOf[AlwaysFailOutputCommitter].getName) // Since Parquet has its own output committer setting, also set it // to AlwaysFailParquetOutputCommitter at here. 
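The committer tests above mutate the shared Hadoop configuration and rely on a clone-then-restore idiom because Hadoop 1 has no `Configuration.unset`. A minimal sketch of that idiom, assuming hadoop-common on the classpath (the property key is a placeholder, not the real SQLConf constant):

    import scala.collection.JavaConverters._
    import org.apache.hadoop.conf.Configuration

    object ConfSnapshotSketch extends App {
      val hadoopConfiguration = new Configuration(false)
      hadoopConfiguration.set("fs.defaultFS", "file:///")

      // Snapshot, mutate, then restore every original entry in a finally block.
      val clonedConf = new Configuration(hadoopConfiguration)
      try {
        hadoopConfiguration.set("example.output.committer.class", "example.AlwaysFail") // placeholder key
        println(hadoopConfiguration.get("example.output.committer.class"))
      } finally {
        hadoopConfiguration.clear()
        clonedConf.asScala.foreach(e => hadoopConfiguration.set(e.getKey, e.getValue))
      }

      println(hadoopConfiguration.get("example.output.committer.class")) // null: override gone
      println(hadoopConfiguration.get("fs.defaultFS"))                   // restored
    }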
- configuration.set("spark.sql.parquet.output.committer.class", + hadoopConfiguration.set("spark.sql.parquet.output.committer.class", classOf[AlwaysFailParquetOutputCommitter].getName) // Because there is no existing data, // this append will fail because AlwaysFailOutputCommitter is used when we do append @@ -549,8 +616,8 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } } finally { // Hadoop 1 doesn't have `Configuration.unset` - configuration.clear() - clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + hadoopConfiguration.clear() + clonedConf.asScala.foreach(entry => hadoopConfiguration.set(entry.getKey, entry.getValue)) } } @@ -570,7 +637,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } test("SPARK-9899 Disable customized output committer when speculation is on") { - val clonedConf = new Configuration(configuration) + val clonedConf = new Configuration(hadoopConfiguration) val speculationEnabled = sqlContext.sparkContext.conf.getBoolean("spark.speculation", defaultValue = false) @@ -580,7 +647,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { sqlContext.sparkContext.conf.set("spark.speculation", "true") // Uses a customized output committer which always fails - configuration.set( + hadoopConfiguration.set( SQLConf.OUTPUT_COMMITTER_CLASS.key, classOf[AlwaysFailOutputCommitter].getName) @@ -597,8 +664,8 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils { } } finally { // Hadoop 1 doesn't have `Configuration.unset` - configuration.clear() - clonedConf.asScala.foreach(entry => configuration.set(entry.getKey, entry.getValue)) + hadoopConfiguration.clear() + clonedConf.asScala.foreach(entry => hadoopConfiguration.set(entry.getKey, entry.getValue)) sqlContext.sparkContext.conf.set("spark.speculation", speculationEnabled.toString) } } diff --git a/streaming/pom.xml b/streaming/pom.xml index 697895e72fe5b..5cc9001b0e9ab 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContextState.java b/streaming/src/main/java/org/apache/spark/streaming/StreamingContextState.java similarity index 100% rename from streaming/src/main/scala/org/apache/spark/streaming/StreamingContextState.java rename to streaming/src/main/java/org/apache/spark/streaming/StreamingContextState.java diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index cd5d960369c05..8a6050f5227bf 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -32,7 +32,7 @@ import org.apache.spark.streaming.scheduler.JobGenerator private[streaming] -class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time) +class Checkpoint(ssc: StreamingContext, val checkpointTime: Time) extends Logging with Serializable { val master = ssc.sc.master val framework = ssc.sc.appName @@ -49,6 +49,8 @@ class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time) // Reload properties for the checkpoint application since user wants to set a reload property // or spark had changed its value and user wants to set it back. 
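Several streaming classes below adjust `@transient` on constructor parameters. As a reminder of what the annotation does when the parameter is kept as a field, and why it is redundant on parameters that are only read during initialization and never become fields, a self-contained sketch (Carrier and its field names are hypothetical):

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

    // A retained @transient field is skipped by Java serialization and comes back
    // null; a plain constructor parameter used only during initialization never
    // becomes a field at all, so @transient on it can simply be dropped.
    class Carrier(@transient private val driverOnly: Array[Int], val shipped: Int) extends Serializable {
      def driverOnlyLength: Int = if (driverOnly == null) -1 else driverOnly.length
    }

    object TransientSketch extends App {
      def roundTrip[T <: Serializable](value: T): T = {
        val buffer = new ByteArrayOutputStream()
        val out = new ObjectOutputStream(buffer)
        out.writeObject(value)
        out.close()
        new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray)).readObject().asInstanceOf[T]
      }

      val restored = roundTrip(new Carrier(Array(1, 2, 3), shipped = 42))
      println(restored.shipped)           // 42: the ordinary val travels with the object
      println(restored.driverOnlyLength)  // -1: the @transient field was not serialized
    }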
val propertiesToReload = List( + "spark.yarn.app.id", + "spark.yarn.app.attemptId", "spark.driver.host", "spark.driver.port", "spark.master", @@ -319,7 +321,7 @@ object CheckpointReader extends Logging { // Try to read the checkpoint files in the order logInfo("Checkpoint files found: " + checkpointFiles.mkString(",")) - val compressionCodec = CompressionCodec.createCodec(conf) + var readError: Exception = null checkpointFiles.foreach(file => { logInfo("Attempting to load checkpoint from file " + file) try { @@ -330,13 +332,15 @@ object CheckpointReader extends Logging { return Some(cp) } catch { case e: Exception => + readError = e logWarning("Error reading checkpoint from file " + file, e) } }) // If none of checkpoint files could be read, then throw exception if (!ignoreReadError) { - throw new SparkException(s"Failed to read checkpoint from directory $checkpointPath") + throw new SparkException( + s"Failed to read checkpoint from directory $checkpointPath", readError) } None } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index b496d1f341a0b..94fea63f55b25 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -44,7 +44,7 @@ import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.{ActorReceiver, ActorSupervisorStrategy, Receiver} import org.apache.spark.streaming.scheduler.{JobScheduler, StreamingListener} import org.apache.spark.streaming.ui.{StreamingJobProgressListener, StreamingTab} -import org.apache.spark.util.{CallSite, ShutdownHookManager, Utils} +import org.apache.spark.util.{CallSite, ShutdownHookManager, ThreadUtils} /** * Main entry point for Spark Streaming functionality. It provides methods used to create @@ -200,6 +200,8 @@ class StreamingContext private[streaming] ( private val startSite = new AtomicReference[CallSite](null) + private[streaming] def getStartSite(): CallSite = startSite.get() + private var shutdownHookRef: AnyRef = _ conf.getOption("spark.streaming.checkpoint.directory").foreach(checkpoint) @@ -588,12 +590,20 @@ class StreamingContext private[streaming] ( state match { case INITIALIZED => startSite.set(DStream.getCreationSite()) - sparkContext.setCallSite(startSite.get) StreamingContext.ACTIVATION_LOCK.synchronized { StreamingContext.assertNoOtherContextIsActive() try { validate() - scheduler.start() + + // Start the streaming scheduler in a new thread, so that thread local properties + // like call sites and job groups can be reset without affecting those of the + // current thread. + ThreadUtils.runInNewThread("streaming-start") { + sparkContext.setCallSite(startSite.get) + sparkContext.clearJobGroup() + sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false") + scheduler.start() + } state = StreamingContextState.ACTIVE } catch { case NonFatal(e) => @@ -618,6 +628,7 @@ class StreamingContext private[streaming] ( } } + /** * Wait for the execution to stop. Any exceptions that occurs during the execution * will be thrown in this thread. @@ -735,7 +746,7 @@ object StreamingContext extends Logging { throw new IllegalStateException( "Only one StreamingContext may be started in this JVM. 
" + "Currently running StreamingContext was started at" + - activeContext.get.startSite.get.longForm) + activeContext.get.getStartSite().longForm) } } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 2c373640d2fd9..dfc569451df86 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -170,7 +170,7 @@ private[python] object PythonDStream { */ private[python] abstract class PythonDStream( parent: DStream[_], - @transient pfunc: PythonTransformFunction) + pfunc: PythonTransformFunction) extends DStream[Array[Byte]] (parent.ssc) { val func = new TransformFunction(pfunc) @@ -187,7 +187,7 @@ private[python] abstract class PythonDStream( */ private[python] class PythonTransformedDStream ( parent: DStream[_], - @transient pfunc: PythonTransformFunction) + pfunc: PythonTransformFunction) extends PythonDStream(parent, pfunc) { override def compute(validTime: Time): Option[RDD[Array[Byte]]] = { @@ -206,7 +206,7 @@ private[python] class PythonTransformedDStream ( private[python] class PythonTransformed2DStream( parent: DStream[_], parent2: DStream[_], - @transient pfunc: PythonTransformFunction) + pfunc: PythonTransformFunction) extends DStream[Array[Byte]] (parent.ssc) { val func = new TransformFunction(pfunc) @@ -230,7 +230,7 @@ private[python] class PythonTransformed2DStream( */ private[python] class PythonStateDStream( parent: DStream[Array[Byte]], - @transient reduceFunc: PythonTransformFunction) + reduceFunc: PythonTransformFunction) extends PythonDStream(parent, reduceFunc) { super.persist(StorageLevel.MEMORY_ONLY) @@ -252,8 +252,8 @@ private[python] class PythonStateDStream( */ private[python] class PythonReducedWindowedDStream( parent: DStream[Array[Byte]], - @transient preduceFunc: PythonTransformFunction, - @transient pinvReduceFunc: PythonTransformFunction, + preduceFunc: PythonTransformFunction, + @transient private val pinvReduceFunc: PythonTransformFunction, _windowDuration: Duration, _slideDuration: Duration) extends PythonDStream(parent, preduceFunc) { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala index c358f5b5bd70b..40208a64861fb 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala @@ -70,7 +70,7 @@ import org.apache.spark.util.{SerializableConfiguration, TimeStampedHashMap, Uti */ private[streaming] class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( - @transient ssc_ : StreamingContext, + ssc_ : StreamingContext, directory: String, filter: Path => Boolean = FileInputDStream.defaultFilter, newFilesOnly: Boolean = true, diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala index a6c4cd220e42f..95994c983c0cc 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala @@ -39,7 +39,7 @@ import org.apache.spark.util.Utils * * @param ssc_ Streaming context that will execute this input stream */ -abstract class 
InputDStream[T: ClassTag] (@transient ssc_ : StreamingContext) +abstract class InputDStream[T: ClassTag] (ssc_ : StreamingContext) extends DStream[T](ssc_) { private[streaming] var lastValidTime: Time = null diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PluggableInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PluggableInputDStream.scala index 186e1bf03a944..002aac9f43617 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PluggableInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PluggableInputDStream.scala @@ -23,7 +23,7 @@ import org.apache.spark.streaming.receiver.Receiver private[streaming] class PluggableInputDStream[T: ClassTag]( - @transient ssc_ : StreamingContext, + ssc_ : StreamingContext, receiver: Receiver[T]) extends ReceiverInputDStream[T](ssc_) { def getReceiver(): Receiver[T] = { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala index a2f5d82a79bd3..a2685046e03d4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming.dstream -import java.io.{NotSerializableException, ObjectOutputStream} +import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag @@ -27,7 +27,7 @@ import org.apache.spark.streaming.{Time, StreamingContext} private[streaming] class QueueInputDStream[T: ClassTag]( - @transient ssc: StreamingContext, + ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] @@ -37,8 +37,13 @@ class QueueInputDStream[T: ClassTag]( override def stop() { } + private def readObject(in: ObjectInputStream): Unit = { + throw new NotSerializableException("queueStream doesn't support checkpointing. 
" + + "Please don't use queueStream when checkpointing is enabled.") + } + private def writeObject(oos: ObjectOutputStream): Unit = { - throw new NotSerializableException("queueStream doesn't support checkpointing") + logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { @@ -52,7 +57,7 @@ class QueueInputDStream[T: ClassTag]( if (oneAtATime) { Some(buffer.head) } else { - Some(new UnionRDD(ssc.sc, buffer.toSeq)) + Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala index e2925b9e03ec3..5a9eda7c12776 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala @@ -39,7 +39,7 @@ import org.apache.spark.streaming.receiver.Receiver */ private[streaming] class RawInputDStream[T: ClassTag]( - @transient ssc_ : StreamingContext, + ssc_ : StreamingContext, host: String, port: Int, storageLevel: StorageLevel diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala index 6c139f32da31d..87c20afd5c13c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala @@ -38,7 +38,7 @@ import org.apache.spark.streaming.{StreamingContext, Time} * @param ssc_ Streaming context that will execute this input stream * @tparam T Class type of the object of this stream */ -abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingContext) +abstract class ReceiverInputDStream[T: ClassTag](ssc_ : StreamingContext) extends InputDStream[T](ssc_) { /** diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala index 5ce5b7aae6e69..de84e0c9a498d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala @@ -32,7 +32,7 @@ import org.apache.spark.streaming.receiver.Receiver private[streaming] class SocketInputDStream[T: ClassTag]( - @transient ssc_ : StreamingContext, + ssc_ : StreamingContext, host: String, port: Int, bytesToObjects: InputStream => Iterator[T], diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala index e081ffe46f502..f811784b25c82 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala @@ -61,7 +61,7 @@ class WriteAheadLogBackedBlockRDDPartition( * * * @param sc SparkContext - * @param blockIds Ids of the blocks that contains this RDD's data + * @param _blockIds Ids of the blocks that contains this RDD's data * @param walRecordHandles Record handles in write ahead logs that contain this RDD's data * @param isBlockIdValid Whether the block Ids are valid (i.e., the blocks are present in the Spark * 
executors). If not, then block lookups by the block ids will be skipped. @@ -73,23 +73,23 @@ class WriteAheadLogBackedBlockRDDPartition( */ private[streaming] class WriteAheadLogBackedBlockRDD[T: ClassTag]( - @transient sc: SparkContext, - @transient blockIds: Array[BlockId], + sc: SparkContext, + @transient private val _blockIds: Array[BlockId], @transient val walRecordHandles: Array[WriteAheadLogRecordHandle], - @transient isBlockIdValid: Array[Boolean] = Array.empty, + @transient private val isBlockIdValid: Array[Boolean] = Array.empty, storeInBlockManager: Boolean = false, storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER) - extends BlockRDD[T](sc, blockIds) { + extends BlockRDD[T](sc, _blockIds) { require( - blockIds.length == walRecordHandles.length, - s"Number of block Ids (${blockIds.length}) must be " + + _blockIds.length == walRecordHandles.length, + s"Number of block Ids (${_blockIds.length}) must be " + s" same as number of WAL record handles (${walRecordHandles.length})") require( - isBlockIdValid.isEmpty || isBlockIdValid.length == blockIds.length, + isBlockIdValid.isEmpty || isBlockIdValid.length == _blockIds.length, s"Number of elements in isBlockIdValid (${isBlockIdValid.length}) must be " + - s" same as number of block Ids (${blockIds.length})") + s" same as number of block Ids (${_blockIds.length})") // Hadoop configuration is not serializable, so broadcast it as a serializable. @transient private val hadoopConfig = sc.hadoopConfiguration @@ -99,9 +99,9 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag]( override def getPartitions: Array[Partition] = { assertValid() - Array.tabulate(blockIds.length) { i => + Array.tabulate(_blockIds.length) { i => val isValid = if (isBlockIdValid.length == 0) true else isBlockIdValid(i) - new WriteAheadLogBackedBlockRDDPartition(i, blockIds(i), isValid, walRecordHandles(i)) + new WriteAheadLogBackedBlockRDDPartition(i, _blockIds(i), isValid, walRecordHandles(i)) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala index 9922b6bc1201b..3c869561ef8b9 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala @@ -39,6 +39,8 @@ case class BatchInfo( processingEndTime: Option[Long] ) { + private var _failureReasons: Map[Int, String] = Map.empty + @deprecated("Use streamIdToInputInfo instead", "1.5.0") def streamIdToNumRecords: Map[Int, Long] = streamIdToInputInfo.mapValues(_.numRecords) @@ -67,4 +69,12 @@ case class BatchInfo( * The number of recorders received by the receivers in this batch. 
*/ def numRecords: Long = streamIdToInputInfo.values.map(_.numRecords).sum + + /** Set the failure reasons corresponding to every output ops in the batch */ + private[streaming] def setFailureReason(reasons: Map[Int, String]): Unit = { + _failureReasons = reasons + } + + /** Failure reasons corresponding to every output ops in the batch */ + private[streaming] def failureReasons = _failureReasons } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 0cd39594ee923..66afbf1b11764 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -25,6 +25,7 @@ import scala.util.{Failure, Success} import org.apache.spark.Logging import org.apache.spark.rdd.PairRDDFunctions import org.apache.spark.streaming._ +import org.apache.spark.streaming.ui.UIUtils import org.apache.spark.util.{EventLoop, ThreadUtils} @@ -165,22 +166,22 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { } private def handleJobCompletion(job: Job) { + val jobSet = jobSets.get(job.time) + jobSet.handleJobCompletion(job) + logInfo("Finished job " + job.id + " from job set of time " + jobSet.time) + if (jobSet.hasCompleted) { + jobSets.remove(jobSet.time) + jobGenerator.onBatchCompletion(jobSet.time) + logInfo("Total delay: %.3f s for time %s (execution: %.3f s)".format( + jobSet.totalDelay / 1000.0, jobSet.time.toString, + jobSet.processingDelay / 1000.0 + )) + listenerBus.post(StreamingListenerBatchCompleted(jobSet.toBatchInfo)) + } job.result match { - case Success(_) => - val jobSet = jobSets.get(job.time) - jobSet.handleJobCompletion(job) - logInfo("Finished job " + job.id + " from job set of time " + jobSet.time) - if (jobSet.hasCompleted) { - jobSets.remove(jobSet.time) - jobGenerator.onBatchCompletion(jobSet.time) - logInfo("Total delay: %.3f s for time %s (execution: %.3f s)".format( - jobSet.totalDelay / 1000.0, jobSet.time.toString, - jobSet.processingDelay / 1000.0 - )) - listenerBus.post(StreamingListenerBatchCompleted(jobSet.toBatchInfo)) - } case Failure(e) => reportError("Error running job " + job, e) + case _ => } } @@ -190,10 +191,20 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { } private class JobHandler(job: Job) extends Runnable with Logging { + import JobScheduler._ + def run() { - ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString) - ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString) try { + val formattedTime = UIUtils.formatBatchTime( + job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false) + val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}" + val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]" + + ssc.sc.setJobDescription( + s"""Streaming job from $batchLinkText""") + ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString) + ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString) + // We need to assign `eventLoop` to a temp variable. Otherwise, because // `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running, then // it's possible that when `post` is called, `eventLoop` happens to null. 
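`handleJobCompletion` above is reordered so the batch bookkeeping (updating the JobSet, posting the completion event) runs for failed jobs as well, with only the error report left inside the Failure branch. The control-flow change, reduced to plain Scala with Try:

    import scala.util.{Failure, Success, Try}

    object CompletionSketch extends App {
      // Mirrors the reordering above: bookkeeping for the batch runs for every
      // outcome; only error reporting is outcome-specific.
      def handleCompletion(jobId: Int, result: Try[Unit]): Unit = {
        println(s"job $jobId finished, updating batch state")   // always runs now
        result match {
          case Failure(e) => println(s"job $jobId failed: ${e.getMessage}")
          case _          => // Success needs no extra handling
        }
      }

      handleCompletion(0, Success(()))
      handleCompletion(1, Failure(new RuntimeException("boom")))
    }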
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala index 95833efc9417f..255ccf0536688 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala @@ -18,8 +18,10 @@ package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet +import scala.util.Failure import org.apache.spark.streaming.Time +import org.apache.spark.util.Utils /** Class representing a set of Jobs * belong to the same batch. @@ -62,12 +64,23 @@ case class JobSet( } def toBatchInfo: BatchInfo = { - new BatchInfo( + val failureReasons: Map[Int, String] = { + if (hasCompleted) { + jobs.filter(_.result.isFailure).map { job => + (job.outputOpId, Utils.exceptionString(job.result.asInstanceOf[Failure[_]].exception)) + }.toMap + } else { + Map.empty + } + } + val binfo = new BatchInfo( time, streamIdToInputInfo, submissionTime, - if (processingStartTime >= 0 ) Some(processingStartTime) else None, - if (processingEndTime >= 0 ) Some(processingEndTime) else None + if (processingStartTime >= 0) Some(processingStartTime) else None, + if (processingEndTime >= 0) Some(processingEndTime) else None ) + binfo.setFailureReason(failureReasons) + binfo } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 3d532a675db02..204e6142fd6cf 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -30,7 +30,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.rpc._ import org.apache.spark.streaming.{StreamingContext, Time} import org.apache.spark.streaming.receiver._ -import org.apache.spark.util.{ThreadUtils, SerializableConfiguration} +import org.apache.spark.util.{Utils, ThreadUtils, SerializableConfiguration} /** Enumeration to identify current state of a Receiver */ @@ -291,7 +291,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false ReceiverTrackingInfo( streamId, ReceiverState.INACTIVE, None, None, None, None, Some(errorInfo)) } - receiverTrackingInfos -= streamId + receiverTrackingInfos(streamId) = newReceiverTrackingInfo listenerBus.post(StreamingListenerReceiverStopped(newReceiverTrackingInfo.toReceiverInfo)) val messageWithError = if (error != null && !error.isEmpty) { s"$message - $error" @@ -483,7 +483,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false context.reply(true) // Local messages case AllReceiverIds => - context.reply(receiverTrackingInfos.keys.toSeq) + context.reply(receiverTrackingInfos.filter(_._2.state != ReceiverState.INACTIVE).keys.toSeq) case StopAllReceivers => assert(isTrackerStopping || isTrackerStopped) stopReceivers() @@ -554,6 +554,9 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false ssc.sc.makeRDD(Seq(receiver -> scheduledExecutors)) } receiverRDD.setName(s"Receiver $receiverId") + ssc.sparkContext.setJobDescription(s"Streaming job running receiver $receiverId") + ssc.sparkContext.setCallSite(Option(ssc.getStartSite()).getOrElse(Utils.getCallSite())) + val future = ssc.sparkContext.submitJob[Receiver[_], Unit, Unit]( receiverRDD, startReceiverFunc, Seq(0), (_, _) => Unit, ()) // We will keep 
restarting the receiver job until ReceiverTracker is stopped diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala index 90d1b0fadecfc..9129c1f26abd4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -19,14 +19,14 @@ package org.apache.spark.streaming.ui import javax.servlet.http.HttpServletRequest -import scala.xml.{NodeSeq, Node, Text, Unparsed} +import scala.xml._ import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.streaming.Time -import org.apache.spark.ui.{UIUtils => SparkUIUtils, WebUIPage} -import org.apache.spark.streaming.ui.StreamingJobProgressListener.{SparkJobId, OutputOpId} +import org.apache.spark.streaming.ui.StreamingJobProgressListener.{OutputOpId, SparkJobId} import org.apache.spark.ui.jobs.UIData.JobUIData +import org.apache.spark.ui.{UIUtils => SparkUIUtils, WebUIPage} private[ui] case class SparkJobIdWithUIData(sparkJobId: SparkJobId, jobUIData: Option[JobUIData]) @@ -207,16 +207,25 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { sparkListener.stageIdToInfo.get(sparkJob.stageIds.max) } } - val lastStageData = lastStageInfo.flatMap { s => - sparkListener.stageIdToData.get((s.stageId, s.attemptId)) - } - - val lastStageName = lastStageInfo.map(_.name).getOrElse("(Unknown Stage Name)") - val lastStageDescription = lastStageData.flatMap(_.description).getOrElse("") + lastStageInfo match { + case Some(stageInfo) => + val details = if (stageInfo.details.nonEmpty) { + + +details + ++ + + } else { + NodeSeq.Empty + } - - {lastStageDescription} - ++ Text(lastStageName) +
<div>{stageInfo.name} {details}</div>
+ case None => + Text("(Unknown)") + } } private def failureReasonCell(failureReason: String): Seq[Node] = { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala index dd32ad5ad811d..0148cb51c6f09 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala @@ -72,8 +72,10 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: /** * Stop the timer, and return the last time the callback was made. - * interruptTimer = true will interrupt the callback + * - interruptTimer = true will interrupt the callback * if it is in progress (not guaranteed to give correct time in this case). + * - interruptTimer = false guarantees that there will be at least one callback after `stop` has + * been called. */ def stop(interruptTimer: Boolean): Long = synchronized { if (!stopped) { @@ -87,18 +89,23 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: prevTime } + private def triggerActionForNextInterval(): Unit = { + clock.waitTillTime(nextTime) + callback(nextTime) + prevTime = nextTime + nextTime += period + logDebug("Callback for " + name + " called at time " + prevTime) + } + /** * Repeatedly call the callback every interval. */ private def loop() { try { while (!stopped) { - clock.waitTillTime(nextTime) - callback(nextTime) - prevTime = nextTime - nextTime += period - logDebug("Callback for " + name + " called at time " + prevTime) + triggerActionForNextInterval() } + triggerActionForNextInterval() } catch { case e: InterruptedException => } diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java index e0718f73aa13f..c5217149224e4 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java @@ -18,24 +18,22 @@ package org.apache.spark.streaming; import java.io.*; -import java.lang.Iterable; import java.nio.charset.Charset; import java.util.*; import java.util.concurrent.atomic.AtomicBoolean; +import scala.Tuple2; + +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import scala.Tuple2; - import org.junit.Assert; -import static org.junit.Assert.*; import org.junit.Test; import com.google.common.base.Optional; -import com.google.common.collect.Lists; import com.google.common.io.Files; import com.google.common.collect.Sets; @@ -54,14 +52,14 @@ // see http://stackoverflow.com/questions/758570/. 
public class JavaAPISuite extends LocalJavaStreamingContext implements Serializable { - public void equalIterator(Iterator a, Iterator b) { + public static void equalIterator(Iterator a, Iterator b) { while (a.hasNext() && b.hasNext()) { Assert.assertEquals(a.next(), b.next()); } Assert.assertEquals(a.hasNext(), b.hasNext()); } - public void equalIterable(Iterable a, Iterable b) { + public static void equalIterable(Iterable a, Iterable b) { equalIterator(a.iterator(), b.iterator()); } @@ -74,14 +72,14 @@ public void testInitialization() { @Test public void testContextState() { List> inputData = Arrays.asList(Arrays.asList(1, 2, 3, 4)); - Assert.assertTrue(ssc.getState() == StreamingContextState.INITIALIZED); + Assert.assertEquals(StreamingContextState.INITIALIZED, ssc.getState()); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaTestUtils.attachTestOutputStream(stream); - Assert.assertTrue(ssc.getState() == StreamingContextState.INITIALIZED); + Assert.assertEquals(StreamingContextState.INITIALIZED, ssc.getState()); ssc.start(); - Assert.assertTrue(ssc.getState() == StreamingContextState.ACTIVE); + Assert.assertEquals(StreamingContextState.ACTIVE, ssc.getState()); ssc.stop(); - Assert.assertTrue(ssc.getState() == StreamingContextState.STOPPED); + Assert.assertEquals(StreamingContextState.STOPPED, ssc.getState()); } @SuppressWarnings("unchecked") @@ -118,7 +116,7 @@ public void testMap() { JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream letterCount = stream.map(new Function() { @Override - public Integer call(String s) throws Exception { + public Integer call(String s) { return s.length(); } }); @@ -180,7 +178,7 @@ public void testWindowWithSlideDuration() { public void testFilter() { List> inputData = Arrays.asList( Arrays.asList("giants", "dodgers"), - Arrays.asList("yankees", "red socks")); + Arrays.asList("yankees", "red sox")); List> expected = Arrays.asList( Arrays.asList("giants"), @@ -189,7 +187,7 @@ public void testFilter() { JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream filtered = stream.filter(new Function() { @Override - public Boolean call(String s) throws Exception { + public Boolean call(String s) { return s.contains("a"); } }); @@ -243,11 +241,11 @@ public void testRepartitionFewerPartitions() { public void testGlom() { List> inputData = Arrays.asList( Arrays.asList("giants", "dodgers"), - Arrays.asList("yankees", "red socks")); + Arrays.asList("yankees", "red sox")); List>> expected = Arrays.asList( Arrays.asList(Arrays.asList("giants", "dodgers")), - Arrays.asList(Arrays.asList("yankees", "red socks"))); + Arrays.asList(Arrays.asList("yankees", "red sox"))); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream> glommed = stream.glom(); @@ -262,22 +260,22 @@ public void testGlom() { public void testMapPartitions() { List> inputData = Arrays.asList( Arrays.asList("giants", "dodgers"), - Arrays.asList("yankees", "red socks")); + Arrays.asList("yankees", "red sox")); List> expected = Arrays.asList( Arrays.asList("GIANTSDODGERS"), - Arrays.asList("YANKEESRED SOCKS")); + Arrays.asList("YANKEESRED SOX")); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream mapped = stream.mapPartitions( new FlatMapFunction, String>() { @Override public Iterable call(Iterator in) { - String out = ""; + StringBuilder out = new StringBuilder(); while (in.hasNext()) { - out = out + in.next().toUpperCase(); + 
out.append(in.next().toUpperCase(Locale.ENGLISH)); } - return Lists.newArrayList(out); + return Arrays.asList(out.toString()); } }); JavaTestUtils.attachTestOutputStream(mapped); @@ -286,16 +284,16 @@ public Iterable call(Iterator in) { Assert.assertEquals(expected, result); } - private class IntegerSum implements Function2 { + private static class IntegerSum implements Function2 { @Override - public Integer call(Integer i1, Integer i2) throws Exception { + public Integer call(Integer i1, Integer i2) { return i1 + i2; } } - private class IntegerDifference implements Function2 { + private static class IntegerDifference implements Function2 { @Override - public Integer call(Integer i1, Integer i2) throws Exception { + public Integer call(Integer i1, Integer i2) { return i1 - i2; } } @@ -347,13 +345,13 @@ private void testReduceByWindow(boolean withInverse) { Arrays.asList(24)); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); - JavaDStream reducedWindowed = null; + JavaDStream reducedWindowed; if (withInverse) { reducedWindowed = stream.reduceByWindow(new IntegerSum(), - new IntegerDifference(), new Duration(2000), new Duration(1000)); + new IntegerDifference(), new Duration(2000), new Duration(1000)); } else { reducedWindowed = stream.reduceByWindow(new IntegerSum(), - new Duration(2000), new Duration(1000)); + new Duration(2000), new Duration(1000)); } JavaTestUtils.attachTestOutputStream(reducedWindowed); List> result = JavaTestUtils.runStreams(ssc, 4, 4); @@ -378,11 +376,11 @@ public void testQueueStream() { Arrays.asList(7,8,9)); JavaSparkContext jsc = new JavaSparkContext(ssc.ssc().sc()); - JavaRDD rdd1 = ssc.sparkContext().parallelize(Arrays.asList(1, 2, 3)); - JavaRDD rdd2 = ssc.sparkContext().parallelize(Arrays.asList(4, 5, 6)); - JavaRDD rdd3 = ssc.sparkContext().parallelize(Arrays.asList(7,8,9)); + JavaRDD rdd1 = jsc.parallelize(Arrays.asList(1, 2, 3)); + JavaRDD rdd2 = jsc.parallelize(Arrays.asList(4, 5, 6)); + JavaRDD rdd3 = jsc.parallelize(Arrays.asList(7,8,9)); - LinkedList> rdds = Lists.newLinkedList(); + Queue> rdds = new LinkedList<>(); rdds.add(rdd1); rdds.add(rdd2); rdds.add(rdd3); @@ -410,10 +408,10 @@ public void testTransform() { JavaDStream transformed = stream.transform( new Function, JavaRDD>() { @Override - public JavaRDD call(JavaRDD in) throws Exception { + public JavaRDD call(JavaRDD in) { return in.map(new Function() { @Override - public Integer call(Integer i) throws Exception { + public Integer call(Integer i) { return i + 2; } }); @@ -435,70 +433,70 @@ public void testVariousTransform() { JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); List>> pairInputData = - Arrays.asList(Arrays.asList(new Tuple2("x", 1))); + Arrays.asList(Arrays.asList(new Tuple2<>("x", 1))); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData, 1)); - JavaDStream transformed1 = stream.transform( + stream.transform( new Function, JavaRDD>() { @Override - public JavaRDD call(JavaRDD in) throws Exception { + public JavaRDD call(JavaRDD in) { return null; } } ); - JavaDStream transformed2 = stream.transform( + stream.transform( new Function2, Time, JavaRDD>() { - @Override public JavaRDD call(JavaRDD in, Time time) throws Exception { + @Override public JavaRDD call(JavaRDD in, Time time) { return null; } } ); - JavaPairDStream transformed3 = stream.transformToPair( + stream.transformToPair( new Function, JavaPairRDD>() { - @Override public JavaPairRDD call(JavaRDD in) 
throws Exception { + @Override public JavaPairRDD call(JavaRDD in) { return null; } } ); - JavaPairDStream transformed4 = stream.transformToPair( + stream.transformToPair( new Function2, Time, JavaPairRDD>() { - @Override public JavaPairRDD call(JavaRDD in, Time time) throws Exception { + @Override public JavaPairRDD call(JavaRDD in, Time time) { return null; } } ); - JavaDStream pairTransformed1 = pairStream.transform( + pairStream.transform( new Function, JavaRDD>() { - @Override public JavaRDD call(JavaPairRDD in) throws Exception { + @Override public JavaRDD call(JavaPairRDD in) { return null; } } ); - JavaDStream pairTransformed2 = pairStream.transform( + pairStream.transform( new Function2, Time, JavaRDD>() { - @Override public JavaRDD call(JavaPairRDD in, Time time) throws Exception { + @Override public JavaRDD call(JavaPairRDD in, Time time) { return null; } } ); - JavaPairDStream pairTransformed3 = pairStream.transformToPair( + pairStream.transformToPair( new Function, JavaPairRDD>() { - @Override public JavaPairRDD call(JavaPairRDD in) throws Exception { + @Override public JavaPairRDD call(JavaPairRDD in) { return null; } } ); - JavaPairDStream pairTransformed4 = pairStream.transformToPair( + pairStream.transformToPair( new Function2, Time, JavaPairRDD>() { - @Override public JavaPairRDD call(JavaPairRDD in, Time time) throws Exception { + @Override public JavaPairRDD call(JavaPairRDD in, Time time) { return null; } } @@ -511,32 +509,32 @@ public JavaRDD call(JavaRDD in) throws Exception { public void testTransformWith() { List>> stringStringKVStream1 = Arrays.asList( Arrays.asList( - new Tuple2("california", "dodgers"), - new Tuple2("new york", "yankees")), + new Tuple2<>("california", "dodgers"), + new Tuple2<>("new york", "yankees")), Arrays.asList( - new Tuple2("california", "sharks"), - new Tuple2("new york", "rangers"))); + new Tuple2<>("california", "sharks"), + new Tuple2<>("new york", "rangers"))); List>> stringStringKVStream2 = Arrays.asList( Arrays.asList( - new Tuple2("california", "giants"), - new Tuple2("new york", "mets")), + new Tuple2<>("california", "giants"), + new Tuple2<>("new york", "mets")), Arrays.asList( - new Tuple2("california", "ducks"), - new Tuple2("new york", "islanders"))); + new Tuple2<>("california", "ducks"), + new Tuple2<>("new york", "islanders"))); List>>> expected = Arrays.asList( Sets.newHashSet( - new Tuple2>("california", - new Tuple2("dodgers", "giants")), - new Tuple2>("new york", - new Tuple2("yankees", "mets"))), + new Tuple2<>("california", + new Tuple2<>("dodgers", "giants")), + new Tuple2<>("new york", + new Tuple2<>("yankees", "mets"))), Sets.newHashSet( - new Tuple2>("california", - new Tuple2("sharks", "ducks")), - new Tuple2>("new york", - new Tuple2("rangers", "islanders")))); + new Tuple2<>("california", + new Tuple2<>("sharks", "ducks")), + new Tuple2<>("new york", + new Tuple2<>("rangers", "islanders")))); JavaDStream> stream1 = JavaTestUtils.attachTestInputStream( ssc, stringStringKVStream1, 1); @@ -552,14 +550,12 @@ public void testTransformWith() { JavaPairRDD, JavaPairRDD, Time, - JavaPairRDD> - >() { + JavaPairRDD>>() { @Override public JavaPairRDD> call( JavaPairRDD rdd1, JavaPairRDD rdd2, - Time time - ) throws Exception { + Time time) { return rdd1.join(rdd2); } } @@ -567,9 +563,9 @@ public JavaPairRDD> call( JavaTestUtils.attachTestOutputStream(joined); List>>> result = JavaTestUtils.runStreams(ssc, 2, 2); - List>>> unorderedResult = Lists.newArrayList(); + List>>> unorderedResult = new ArrayList<>(); for (List>> 
res: result) { - unorderedResult.add(Sets.newHashSet(res)); + unorderedResult.add(Sets.newHashSet(res)); } Assert.assertEquals(expected, unorderedResult); @@ -587,89 +583,89 @@ public void testVariousTransformWith() { JavaDStream stream2 = JavaTestUtils.attachTestInputStream(ssc, inputData2, 1); List>> pairInputData1 = - Arrays.asList(Arrays.asList(new Tuple2("x", 1))); + Arrays.asList(Arrays.asList(new Tuple2<>("x", 1))); List>> pairInputData2 = - Arrays.asList(Arrays.asList(new Tuple2(1.0, 'x'))); + Arrays.asList(Arrays.asList(new Tuple2<>(1.0, 'x'))); JavaPairDStream pairStream1 = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData1, 1)); JavaPairDStream pairStream2 = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData2, 1)); - JavaDStream transformed1 = stream1.transformWith( + stream1.transformWith( stream2, new Function3, JavaRDD, Time, JavaRDD>() { @Override - public JavaRDD call(JavaRDD rdd1, JavaRDD rdd2, Time time) throws Exception { + public JavaRDD call(JavaRDD rdd1, JavaRDD rdd2, Time time) { return null; } } ); - JavaDStream transformed2 = stream1.transformWith( + stream1.transformWith( pairStream1, new Function3, JavaPairRDD, Time, JavaRDD>() { @Override - public JavaRDD call(JavaRDD rdd1, JavaPairRDD rdd2, Time time) throws Exception { + public JavaRDD call(JavaRDD rdd1, JavaPairRDD rdd2, Time time) { return null; } } ); - JavaPairDStream transformed3 = stream1.transformWithToPair( + stream1.transformWithToPair( stream2, new Function3, JavaRDD, Time, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaRDD rdd1, JavaRDD rdd2, Time time) throws Exception { + public JavaPairRDD call(JavaRDD rdd1, JavaRDD rdd2, Time time) { return null; } } ); - JavaPairDStream transformed4 = stream1.transformWithToPair( + stream1.transformWithToPair( pairStream1, new Function3, JavaPairRDD, Time, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaRDD rdd1, JavaPairRDD rdd2, Time time) throws Exception { + public JavaPairRDD call(JavaRDD rdd1, JavaPairRDD rdd2, Time time) { return null; } } ); - JavaDStream pairTransformed1 = pairStream1.transformWith( + pairStream1.transformWith( stream2, new Function3, JavaRDD, Time, JavaRDD>() { @Override - public JavaRDD call(JavaPairRDD rdd1, JavaRDD rdd2, Time time) throws Exception { + public JavaRDD call(JavaPairRDD rdd1, JavaRDD rdd2, Time time) { return null; } } ); - JavaDStream pairTransformed2_ = pairStream1.transformWith( + pairStream1.transformWith( pairStream1, new Function3, JavaPairRDD, Time, JavaRDD>() { @Override - public JavaRDD call(JavaPairRDD rdd1, JavaPairRDD rdd2, Time time) throws Exception { + public JavaRDD call(JavaPairRDD rdd1, JavaPairRDD rdd2, Time time) { return null; } } ); - JavaPairDStream pairTransformed3 = pairStream1.transformWithToPair( + pairStream1.transformWithToPair( stream2, new Function3, JavaRDD, Time, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaPairRDD rdd1, JavaRDD rdd2, Time time) throws Exception { + public JavaPairRDD call(JavaPairRDD rdd1, JavaRDD rdd2, Time time) { return null; } } ); - JavaPairDStream pairTransformed4 = pairStream1.transformWithToPair( + pairStream1.transformWithToPair( pairStream2, new Function3, JavaPairRDD, Time, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaPairRDD rdd1, JavaPairRDD rdd2, Time time) throws Exception { + public JavaPairRDD call(JavaPairRDD rdd1, JavaPairRDD rdd2, Time time) { return null; } } @@ -690,13 +686,13 @@ public void testStreamingContextTransform(){ ); 
List>> pairStream1input = Arrays.asList( - Arrays.asList(new Tuple2(1, "x")), - Arrays.asList(new Tuple2(2, "y")) + Arrays.asList(new Tuple2<>(1, "x")), + Arrays.asList(new Tuple2<>(2, "y")) ); List>>> expected = Arrays.asList( - Arrays.asList(new Tuple2>(1, new Tuple2(1, "x"))), - Arrays.asList(new Tuple2>(2, new Tuple2(2, "y"))) + Arrays.asList(new Tuple2<>(1, new Tuple2<>(1, "x"))), + Arrays.asList(new Tuple2<>(2, new Tuple2<>(2, "y"))) ); JavaDStream stream1 = JavaTestUtils.attachTestInputStream(ssc, stream1input, 1); @@ -707,7 +703,7 @@ public void testStreamingContextTransform(){ List> listOfDStreams1 = Arrays.>asList(stream1, stream2); // This is just to test whether this transform to JavaStream compiles - JavaDStream transformed1 = ssc.transform( + ssc.transform( listOfDStreams1, new Function2>, Time, JavaRDD>() { @Override @@ -733,8 +729,8 @@ public JavaPairRDD> call(List> listO JavaPairRDD prdd3 = JavaPairRDD.fromJavaRDD(rdd3); PairFunction mapToTuple = new PairFunction() { @Override - public Tuple2 call(Integer i) throws Exception { - return new Tuple2(i, i); + public Tuple2 call(Integer i) { + return new Tuple2<>(i, i); } }; return rdd1.union(rdd2).mapToPair(mapToTuple).join(prdd3); @@ -763,7 +759,7 @@ public void testFlatMap() { JavaDStream flatMapped = stream.flatMap(new FlatMapFunction() { @Override public Iterable call(String x) { - return Lists.newArrayList(x.split("(?!^)")); + return Arrays.asList(x.split("(?!^)")); } }); JavaTestUtils.attachTestOutputStream(flatMapped); @@ -782,39 +778,39 @@ public void testPairFlatMap() { List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(6, "g"), - new Tuple2(6, "i"), - new Tuple2(6, "a"), - new Tuple2(6, "n"), - new Tuple2(6, "t"), - new Tuple2(6, "s")), + new Tuple2<>(6, "g"), + new Tuple2<>(6, "i"), + new Tuple2<>(6, "a"), + new Tuple2<>(6, "n"), + new Tuple2<>(6, "t"), + new Tuple2<>(6, "s")), Arrays.asList( - new Tuple2(7, "d"), - new Tuple2(7, "o"), - new Tuple2(7, "d"), - new Tuple2(7, "g"), - new Tuple2(7, "e"), - new Tuple2(7, "r"), - new Tuple2(7, "s")), + new Tuple2<>(7, "d"), + new Tuple2<>(7, "o"), + new Tuple2<>(7, "d"), + new Tuple2<>(7, "g"), + new Tuple2<>(7, "e"), + new Tuple2<>(7, "r"), + new Tuple2<>(7, "s")), Arrays.asList( - new Tuple2(9, "a"), - new Tuple2(9, "t"), - new Tuple2(9, "h"), - new Tuple2(9, "l"), - new Tuple2(9, "e"), - new Tuple2(9, "t"), - new Tuple2(9, "i"), - new Tuple2(9, "c"), - new Tuple2(9, "s"))); + new Tuple2<>(9, "a"), + new Tuple2<>(9, "t"), + new Tuple2<>(9, "h"), + new Tuple2<>(9, "l"), + new Tuple2<>(9, "e"), + new Tuple2<>(9, "t"), + new Tuple2<>(9, "i"), + new Tuple2<>(9, "c"), + new Tuple2<>(9, "s"))); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream flatMapped = stream.flatMapToPair( new PairFlatMapFunction() { @Override - public Iterable> call(String in) throws Exception { - List> out = Lists.newArrayList(); + public Iterable> call(String in) { + List> out = new ArrayList<>(); for (String letter: in.split("(?!^)")) { - out.add(new Tuple2(in.length(), letter)); + out.add(new Tuple2<>(in.length(), letter)); } return out; } @@ -859,13 +855,13 @@ public void testUnion() { */ public static void assertOrderInvariantEquals( List> expected, List> actual) { - List> expectedSets = new ArrayList>(); + List> expectedSets = new ArrayList<>(); for (List list: expected) { - expectedSets.add(Collections.unmodifiableSet(new HashSet(list))); + expectedSets.add(Collections.unmodifiableSet(new HashSet<>(list))); } - List> actualSets = new 
ArrayList>(); + List> actualSets = new ArrayList<>(); for (List list: actual) { - actualSets.add(Collections.unmodifiableSet(new HashSet(list))); + actualSets.add(Collections.unmodifiableSet(new HashSet<>(list))); } Assert.assertEquals(expectedSets, actualSets); } @@ -877,25 +873,25 @@ public static void assertOrderInvariantEquals( public void testPairFilter() { List> inputData = Arrays.asList( Arrays.asList("giants", "dodgers"), - Arrays.asList("yankees", "red socks")); + Arrays.asList("yankees", "red sox")); List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("giants", 6)), - Arrays.asList(new Tuple2("yankees", 7))); + Arrays.asList(new Tuple2<>("giants", 6)), + Arrays.asList(new Tuple2<>("yankees", 7))); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = stream.mapToPair( new PairFunction() { @Override - public Tuple2 call(String in) throws Exception { - return new Tuple2(in, in.length()); + public Tuple2 call(String in) { + return new Tuple2<>(in, in.length()); } }); JavaPairDStream filtered = pairStream.filter( new Function, Boolean>() { @Override - public Boolean call(Tuple2 in) throws Exception { + public Boolean call(Tuple2 in) { return in._1().contains("a"); } }); @@ -906,28 +902,28 @@ public Boolean call(Tuple2 in) throws Exception { } @SuppressWarnings("unchecked") - private List>> stringStringKVStream = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers"), - new Tuple2("california", "giants"), - new Tuple2("new york", "yankees"), - new Tuple2("new york", "mets")), - Arrays.asList(new Tuple2("california", "sharks"), - new Tuple2("california", "ducks"), - new Tuple2("new york", "rangers"), - new Tuple2("new york", "islanders"))); + private final List>> stringStringKVStream = Arrays.asList( + Arrays.asList(new Tuple2<>("california", "dodgers"), + new Tuple2<>("california", "giants"), + new Tuple2<>("new york", "yankees"), + new Tuple2<>("new york", "mets")), + Arrays.asList(new Tuple2<>("california", "sharks"), + new Tuple2<>("california", "ducks"), + new Tuple2<>("new york", "rangers"), + new Tuple2<>("new york", "islanders"))); @SuppressWarnings("unchecked") - private List>> stringIntKVStream = Arrays.asList( + private final List>> stringIntKVStream = Arrays.asList( Arrays.asList( - new Tuple2("california", 1), - new Tuple2("california", 3), - new Tuple2("new york", 4), - new Tuple2("new york", 1)), + new Tuple2<>("california", 1), + new Tuple2<>("california", 3), + new Tuple2<>("new york", 4), + new Tuple2<>("new york", 1)), Arrays.asList( - new Tuple2("california", 5), - new Tuple2("california", 5), - new Tuple2("new york", 3), - new Tuple2("new york", 1))); + new Tuple2<>("california", 5), + new Tuple2<>("california", 5), + new Tuple2<>("new york", 3), + new Tuple2<>("new york", 1))); @SuppressWarnings("unchecked") @Test @@ -936,22 +932,22 @@ public void testPairMap() { // Maps pair -> pair of different type List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(1, "california"), - new Tuple2(3, "california"), - new Tuple2(4, "new york"), - new Tuple2(1, "new york")), + new Tuple2<>(1, "california"), + new Tuple2<>(3, "california"), + new Tuple2<>(4, "new york"), + new Tuple2<>(1, "new york")), Arrays.asList( - new Tuple2(5, "california"), - new Tuple2(5, "california"), - new Tuple2(3, "new york"), - new Tuple2(1, "new york"))); + new Tuple2<>(5, "california"), + new Tuple2<>(5, "california"), + new Tuple2<>(3, "new york"), + new Tuple2<>(1, "new york"))); JavaDStream> stream = 
JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream reversed = pairStream.mapToPair( new PairFunction, Integer, String>() { @Override - public Tuple2 call(Tuple2 in) throws Exception { + public Tuple2 call(Tuple2 in) { return in.swap(); } }); @@ -969,23 +965,23 @@ public void testPairMapPartitions() { // Maps pair -> pair of different type List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(1, "california"), - new Tuple2(3, "california"), - new Tuple2(4, "new york"), - new Tuple2(1, "new york")), + new Tuple2<>(1, "california"), + new Tuple2<>(3, "california"), + new Tuple2<>(4, "new york"), + new Tuple2<>(1, "new york")), Arrays.asList( - new Tuple2(5, "california"), - new Tuple2(5, "california"), - new Tuple2(3, "new york"), - new Tuple2(1, "new york"))); + new Tuple2<>(5, "california"), + new Tuple2<>(5, "california"), + new Tuple2<>(3, "new york"), + new Tuple2<>(1, "new york"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream reversed = pairStream.mapPartitionsToPair( new PairFlatMapFunction>, Integer, String>() { @Override - public Iterable> call(Iterator> in) throws Exception { - LinkedList> out = new LinkedList>(); + public Iterable> call(Iterator> in) { + List> out = new LinkedList<>(); while (in.hasNext()) { Tuple2 next = in.next(); out.add(next.swap()); @@ -1014,7 +1010,7 @@ public void testPairMap2() { // Maps pair -> single JavaDStream reversed = pairStream.map( new Function, Integer>() { @Override - public Integer call(Tuple2 in) throws Exception { + public Integer call(Tuple2 in) { return in._2(); } }); @@ -1030,23 +1026,23 @@ public Integer call(Tuple2 in) throws Exception { public void testPairToPairFlatMapWithChangingTypes() { // Maps pair -> pair List>> inputData = Arrays.asList( Arrays.asList( - new Tuple2("hi", 1), - new Tuple2("ho", 2)), + new Tuple2<>("hi", 1), + new Tuple2<>("ho", 2)), Arrays.asList( - new Tuple2("hi", 1), - new Tuple2("ho", 2))); + new Tuple2<>("hi", 1), + new Tuple2<>("ho", 2))); List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(1, "h"), - new Tuple2(1, "i"), - new Tuple2(2, "h"), - new Tuple2(2, "o")), + new Tuple2<>(1, "h"), + new Tuple2<>(1, "i"), + new Tuple2<>(2, "h"), + new Tuple2<>(2, "o")), Arrays.asList( - new Tuple2(1, "h"), - new Tuple2(1, "i"), - new Tuple2(2, "h"), - new Tuple2(2, "o"))); + new Tuple2<>(1, "h"), + new Tuple2<>(1, "i"), + new Tuple2<>(2, "h"), + new Tuple2<>(2, "o"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); @@ -1054,10 +1050,10 @@ public void testPairToPairFlatMapWithChangingTypes() { // Maps pair -> pair JavaPairDStream flatMapped = pairStream.flatMapToPair( new PairFlatMapFunction, Integer, String>() { @Override - public Iterable> call(Tuple2 in) throws Exception { - List> out = new LinkedList>(); + public Iterable> call(Tuple2 in) { + List> out = new LinkedList<>(); for (Character s : in._1().toCharArray()) { - out.add(new Tuple2(in._2(), s.toString())); + out.add(new Tuple2<>(in._2(), s.toString())); } return out; } @@ -1075,11 +1071,11 @@ public void testPairGroupByKey() { List>>> expected = Arrays.asList( Arrays.asList( - new Tuple2>("california", Arrays.asList("dodgers", "giants")), - new Tuple2>("new york", Arrays.asList("yankees", "mets"))), + new Tuple2<>("california", Arrays.asList("dodgers", "giants")), + new Tuple2<>("new york", 
Arrays.asList("yankees", "mets"))), Arrays.asList( - new Tuple2>("california", Arrays.asList("sharks", "ducks")), - new Tuple2>("new york", Arrays.asList("rangers", "islanders")))); + new Tuple2<>("california", Arrays.asList("sharks", "ducks")), + new Tuple2<>("new york", Arrays.asList("rangers", "islanders")))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); @@ -1111,11 +1107,11 @@ public void testPairReduceByKey() { List>> expected = Arrays.asList( Arrays.asList( - new Tuple2("california", 4), - new Tuple2("new york", 5)), + new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), Arrays.asList( - new Tuple2("california", 10), - new Tuple2("new york", 4))); + new Tuple2<>("california", 10), + new Tuple2<>("new york", 4))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1136,20 +1132,20 @@ public void testCombineByKey() { List>> expected = Arrays.asList( Arrays.asList( - new Tuple2("california", 4), - new Tuple2("new york", 5)), + new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), Arrays.asList( - new Tuple2("california", 10), - new Tuple2("new york", 4))); + new Tuple2<>("california", 10), + new Tuple2<>("new york", 4))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); - JavaPairDStream combined = pairStream.combineByKey( + JavaPairDStream combined = pairStream.combineByKey( new Function() { @Override - public Integer call(Integer i) throws Exception { + public Integer call(Integer i) { return i; } }, new IntegerSum(), new IntegerSum(), new HashPartitioner(2)); @@ -1170,13 +1166,13 @@ public void testCountByValue() { List>> expected = Arrays.asList( Arrays.asList( - new Tuple2("hello", 1L), - new Tuple2("world", 1L)), + new Tuple2<>("hello", 1L), + new Tuple2<>("world", 1L)), Arrays.asList( - new Tuple2("hello", 1L), - new Tuple2("moon", 1L)), + new Tuple2<>("hello", 1L), + new Tuple2<>("moon", 1L)), Arrays.asList( - new Tuple2("hello", 1L))); + new Tuple2<>("hello", 1L))); JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream counted = stream.countByValue(); @@ -1193,16 +1189,16 @@ public void testGroupByKeyAndWindow() { List>>> expected = Arrays.asList( Arrays.asList( - new Tuple2>("california", Arrays.asList(1, 3)), - new Tuple2>("new york", Arrays.asList(1, 4)) + new Tuple2<>("california", Arrays.asList(1, 3)), + new Tuple2<>("new york", Arrays.asList(1, 4)) ), Arrays.asList( - new Tuple2>("california", Arrays.asList(1, 3, 5, 5)), - new Tuple2>("new york", Arrays.asList(1, 1, 3, 4)) + new Tuple2<>("california", Arrays.asList(1, 3, 5, 5)), + new Tuple2<>("new york", Arrays.asList(1, 1, 3, 4)) ), Arrays.asList( - new Tuple2>("california", Arrays.asList(5, 5)), - new Tuple2>("new york", Arrays.asList(1, 3)) + new Tuple2<>("california", Arrays.asList(5, 5)), + new Tuple2<>("new york", Arrays.asList(1, 3)) ) ); @@ -1220,16 +1216,16 @@ public void testGroupByKeyAndWindow() { } } - private HashSet>> convert(List>> listOfTuples) { - List>> newListOfTuples = new ArrayList>>(); + private static Set>> convert(List>> listOfTuples) { + List>> newListOfTuples = new ArrayList<>(); for (Tuple2> tuple: listOfTuples) { newListOfTuples.add(convert(tuple)); } - return new HashSet>>(newListOfTuples); + return new HashSet<>(newListOfTuples); } - private Tuple2> convert(Tuple2> tuple) { - return new 
Tuple2>(tuple._1(), new HashSet(tuple._2())); + private static Tuple2> convert(Tuple2> tuple) { + return new Tuple2<>(tuple._1(), new HashSet<>(tuple._2())); } @SuppressWarnings("unchecked") @@ -1238,12 +1234,12 @@ public void testReduceByKeyAndWindow() { List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", 4), - new Tuple2("new york", 5)), - Arrays.asList(new Tuple2("california", 14), - new Tuple2("new york", 9)), - Arrays.asList(new Tuple2("california", 10), - new Tuple2("new york", 4))); + Arrays.asList(new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), + Arrays.asList(new Tuple2<>("california", 14), + new Tuple2<>("new york", 9)), + Arrays.asList(new Tuple2<>("california", 10), + new Tuple2<>("new york", 4))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); @@ -1262,12 +1258,12 @@ public void testUpdateStateByKey() { List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", 4), - new Tuple2("new york", 5)), - Arrays.asList(new Tuple2("california", 14), - new Tuple2("new york", 9)), - Arrays.asList(new Tuple2("california", 14), - new Tuple2("new york", 9))); + Arrays.asList(new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), + Arrays.asList(new Tuple2<>("california", 14), + new Tuple2<>("new york", 9)), + Arrays.asList(new Tuple2<>("california", 14), + new Tuple2<>("new york", 9))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); @@ -1278,10 +1274,10 @@ public void testUpdateStateByKey() { public Optional call(List values, Optional state) { int out = 0; if (state.isPresent()) { - out = out + state.get(); + out += state.get(); } for (Integer v : values) { - out = out + v; + out += v; } return Optional.of(out); } @@ -1298,19 +1294,19 @@ public void testUpdateStateByKeyWithInitial() { List>> inputData = stringIntKVStream; List> initial = Arrays.asList ( - new Tuple2 ("california", 1), - new Tuple2 ("new york", 2)); + new Tuple2<>("california", 1), + new Tuple2<>("new york", 2)); JavaRDD> tmpRDD = ssc.sparkContext().parallelize(initial); JavaPairRDD initialRDD = JavaPairRDD.fromJavaRDD (tmpRDD); List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", 5), - new Tuple2("new york", 7)), - Arrays.asList(new Tuple2("california", 15), - new Tuple2("new york", 11)), - Arrays.asList(new Tuple2("california", 15), - new Tuple2("new york", 11))); + Arrays.asList(new Tuple2<>("california", 5), + new Tuple2<>("new york", 7)), + Arrays.asList(new Tuple2<>("california", 15), + new Tuple2<>("new york", 11)), + Arrays.asList(new Tuple2<>("california", 15), + new Tuple2<>("new york", 11))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); @@ -1321,10 +1317,10 @@ public void testUpdateStateByKeyWithInitial() { public Optional call(List values, Optional state) { int out = 0; if (state.isPresent()) { - out = out + state.get(); + out += state.get(); } for (Integer v : values) { - out = out + v; + out += v; } return Optional.of(out); } @@ -1341,19 +1337,19 @@ public void testReduceByKeyAndWindowWithInverse() { List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", 4), - new Tuple2("new york", 5)), - 
Arrays.asList(new Tuple2("california", 14), - new Tuple2("new york", 9)), - Arrays.asList(new Tuple2("california", 10), - new Tuple2("new york", 4))); + Arrays.asList(new Tuple2<>("california", 4), + new Tuple2<>("new york", 5)), + Arrays.asList(new Tuple2<>("california", 14), + new Tuple2<>("new york", 9)), + Arrays.asList(new Tuple2<>("california", 10), + new Tuple2<>("new york", 4))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream reduceWindowed = pairStream.reduceByKeyAndWindow(new IntegerSum(), new IntegerDifference(), - new Duration(2000), new Duration(1000)); + new Duration(2000), new Duration(1000)); JavaTestUtils.attachTestOutputStream(reduceWindowed); List>> result = JavaTestUtils.runStreams(ssc, 3, 3); @@ -1370,15 +1366,15 @@ public void testCountByValueAndWindow() { List>> expected = Arrays.asList( Sets.newHashSet( - new Tuple2("hello", 1L), - new Tuple2("world", 1L)), + new Tuple2<>("hello", 1L), + new Tuple2<>("world", 1L)), Sets.newHashSet( - new Tuple2("hello", 2L), - new Tuple2("world", 1L), - new Tuple2("moon", 1L)), + new Tuple2<>("hello", 2L), + new Tuple2<>("world", 1L), + new Tuple2<>("moon", 1L)), Sets.newHashSet( - new Tuple2("hello", 2L), - new Tuple2("moon", 1L))); + new Tuple2<>("hello", 2L), + new Tuple2<>("moon", 1L))); JavaDStream stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1386,7 +1382,7 @@ public void testCountByValueAndWindow() { stream.countByValueAndWindow(new Duration(2000), new Duration(1000)); JavaTestUtils.attachTestOutputStream(counted); List>> result = JavaTestUtils.runStreams(ssc, 3, 3); - List>> unorderedResult = Lists.newArrayList(); + List>> unorderedResult = new ArrayList<>(); for (List> res: result) { unorderedResult.add(Sets.newHashSet(res)); } @@ -1399,27 +1395,27 @@ public void testCountByValueAndWindow() { public void testPairTransform() { List>> inputData = Arrays.asList( Arrays.asList( - new Tuple2(3, 5), - new Tuple2(1, 5), - new Tuple2(4, 5), - new Tuple2(2, 5)), + new Tuple2<>(3, 5), + new Tuple2<>(1, 5), + new Tuple2<>(4, 5), + new Tuple2<>(2, 5)), Arrays.asList( - new Tuple2(2, 5), - new Tuple2(3, 5), - new Tuple2(4, 5), - new Tuple2(1, 5))); + new Tuple2<>(2, 5), + new Tuple2<>(3, 5), + new Tuple2<>(4, 5), + new Tuple2<>(1, 5))); List>> expected = Arrays.asList( Arrays.asList( - new Tuple2(1, 5), - new Tuple2(2, 5), - new Tuple2(3, 5), - new Tuple2(4, 5)), + new Tuple2<>(1, 5), + new Tuple2<>(2, 5), + new Tuple2<>(3, 5), + new Tuple2<>(4, 5)), Arrays.asList( - new Tuple2(1, 5), - new Tuple2(2, 5), - new Tuple2(3, 5), - new Tuple2(4, 5))); + new Tuple2<>(1, 5), + new Tuple2<>(2, 5), + new Tuple2<>(3, 5), + new Tuple2<>(4, 5))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1428,7 +1424,7 @@ public void testPairTransform() { JavaPairDStream sorted = pairStream.transformToPair( new Function, JavaPairRDD>() { @Override - public JavaPairRDD call(JavaPairRDD in) throws Exception { + public JavaPairRDD call(JavaPairRDD in) { return in.sortByKey(); } }); @@ -1444,15 +1440,15 @@ public JavaPairRDD call(JavaPairRDD in) thro public void testPairToNormalRDDTransform() { List>> inputData = Arrays.asList( Arrays.asList( - new Tuple2(3, 5), - new Tuple2(1, 5), - new Tuple2(4, 5), - new Tuple2(2, 5)), + new Tuple2<>(3, 5), + new Tuple2<>(1, 5), + new Tuple2<>(4, 5), + new Tuple2<>(2, 5)), Arrays.asList( - new Tuple2(2, 5), - new Tuple2(3, 5), - new Tuple2(4, 5), - new 
Tuple2(1, 5))); + new Tuple2<>(2, 5), + new Tuple2<>(3, 5), + new Tuple2<>(4, 5), + new Tuple2<>(1, 5))); List> expected = Arrays.asList( Arrays.asList(3,1,4,2), @@ -1465,11 +1461,11 @@ public void testPairToNormalRDDTransform() { JavaDStream firstParts = pairStream.transform( new Function, JavaRDD>() { @Override - public JavaRDD call(JavaPairRDD in) throws Exception { + public JavaRDD call(JavaPairRDD in) { return in.map(new Function, Integer>() { @Override - public Integer call(Tuple2 in) { - return in._1(); + public Integer call(Tuple2 in2) { + return in2._1(); } }); } @@ -1487,14 +1483,14 @@ public void testMapValues() { List>> inputData = stringStringKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", "DODGERS"), - new Tuple2("california", "GIANTS"), - new Tuple2("new york", "YANKEES"), - new Tuple2("new york", "METS")), - Arrays.asList(new Tuple2("california", "SHARKS"), - new Tuple2("california", "DUCKS"), - new Tuple2("new york", "RANGERS"), - new Tuple2("new york", "ISLANDERS"))); + Arrays.asList(new Tuple2<>("california", "DODGERS"), + new Tuple2<>("california", "GIANTS"), + new Tuple2<>("new york", "YANKEES"), + new Tuple2<>("new york", "METS")), + Arrays.asList(new Tuple2<>("california", "SHARKS"), + new Tuple2<>("california", "DUCKS"), + new Tuple2<>("new york", "RANGERS"), + new Tuple2<>("new york", "ISLANDERS"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1502,8 +1498,8 @@ public void testMapValues() { JavaPairDStream mapped = pairStream.mapValues(new Function() { @Override - public String call(String s) throws Exception { - return s.toUpperCase(); + public String call(String s) { + return s.toUpperCase(Locale.ENGLISH); } }); @@ -1519,22 +1515,22 @@ public void testFlatMapValues() { List>> inputData = stringStringKVStream; List>> expected = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers1"), - new Tuple2("california", "dodgers2"), - new Tuple2("california", "giants1"), - new Tuple2("california", "giants2"), - new Tuple2("new york", "yankees1"), - new Tuple2("new york", "yankees2"), - new Tuple2("new york", "mets1"), - new Tuple2("new york", "mets2")), - Arrays.asList(new Tuple2("california", "sharks1"), - new Tuple2("california", "sharks2"), - new Tuple2("california", "ducks1"), - new Tuple2("california", "ducks2"), - new Tuple2("new york", "rangers1"), - new Tuple2("new york", "rangers2"), - new Tuple2("new york", "islanders1"), - new Tuple2("new york", "islanders2"))); + Arrays.asList(new Tuple2<>("california", "dodgers1"), + new Tuple2<>("california", "dodgers2"), + new Tuple2<>("california", "giants1"), + new Tuple2<>("california", "giants2"), + new Tuple2<>("new york", "yankees1"), + new Tuple2<>("new york", "yankees2"), + new Tuple2<>("new york", "mets1"), + new Tuple2<>("new york", "mets2")), + Arrays.asList(new Tuple2<>("california", "sharks1"), + new Tuple2<>("california", "sharks2"), + new Tuple2<>("california", "ducks1"), + new Tuple2<>("california", "ducks2"), + new Tuple2<>("new york", "rangers1"), + new Tuple2<>("new york", "rangers2"), + new Tuple2<>("new york", "islanders1"), + new Tuple2<>("new york", "islanders2"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream( ssc, inputData, 1); @@ -1545,7 +1541,7 @@ public void testFlatMapValues() { new Function>() { @Override public Iterable call(String in) { - List out = new ArrayList(); + List out = new ArrayList<>(); out.add(in + "1"); out.add(in + "2"); return out; @@ -1562,29 +1558,29 @@ public Iterable 
call(String in) { @Test public void testCoGroup() { List>> stringStringKVStream1 = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers"), - new Tuple2("new york", "yankees")), - Arrays.asList(new Tuple2("california", "sharks"), - new Tuple2("new york", "rangers"))); + Arrays.asList(new Tuple2<>("california", "dodgers"), + new Tuple2<>("new york", "yankees")), + Arrays.asList(new Tuple2<>("california", "sharks"), + new Tuple2<>("new york", "rangers"))); List>> stringStringKVStream2 = Arrays.asList( - Arrays.asList(new Tuple2("california", "giants"), - new Tuple2("new york", "mets")), - Arrays.asList(new Tuple2("california", "ducks"), - new Tuple2("new york", "islanders"))); + Arrays.asList(new Tuple2<>("california", "giants"), + new Tuple2<>("new york", "mets")), + Arrays.asList(new Tuple2<>("california", "ducks"), + new Tuple2<>("new york", "islanders"))); List, List>>>> expected = Arrays.asList( Arrays.asList( - new Tuple2, List>>("california", - new Tuple2, List>(Arrays.asList("dodgers"), Arrays.asList("giants"))), - new Tuple2, List>>("new york", - new Tuple2, List>(Arrays.asList("yankees"), Arrays.asList("mets")))), + new Tuple2<>("california", + new Tuple2<>(Arrays.asList("dodgers"), Arrays.asList("giants"))), + new Tuple2<>("new york", + new Tuple2<>(Arrays.asList("yankees"), Arrays.asList("mets")))), Arrays.asList( - new Tuple2, List>>("california", - new Tuple2, List>(Arrays.asList("sharks"), Arrays.asList("ducks"))), - new Tuple2, List>>("new york", - new Tuple2, List>(Arrays.asList("rangers"), Arrays.asList("islanders"))))); + new Tuple2<>("california", + new Tuple2<>(Arrays.asList("sharks"), Arrays.asList("ducks"))), + new Tuple2<>("new york", + new Tuple2<>(Arrays.asList("rangers"), Arrays.asList("islanders"))))); JavaDStream> stream1 = JavaTestUtils.attachTestInputStream( @@ -1620,29 +1616,29 @@ public void testCoGroup() { @Test public void testJoin() { List>> stringStringKVStream1 = Arrays.asList( - Arrays.asList(new Tuple2("california", "dodgers"), - new Tuple2("new york", "yankees")), - Arrays.asList(new Tuple2("california", "sharks"), - new Tuple2("new york", "rangers"))); + Arrays.asList(new Tuple2<>("california", "dodgers"), + new Tuple2<>("new york", "yankees")), + Arrays.asList(new Tuple2<>("california", "sharks"), + new Tuple2<>("new york", "rangers"))); List>> stringStringKVStream2 = Arrays.asList( - Arrays.asList(new Tuple2("california", "giants"), - new Tuple2("new york", "mets")), - Arrays.asList(new Tuple2("california", "ducks"), - new Tuple2("new york", "islanders"))); + Arrays.asList(new Tuple2<>("california", "giants"), + new Tuple2<>("new york", "mets")), + Arrays.asList(new Tuple2<>("california", "ducks"), + new Tuple2<>("new york", "islanders"))); List>>> expected = Arrays.asList( Arrays.asList( - new Tuple2>("california", - new Tuple2("dodgers", "giants")), - new Tuple2>("new york", - new Tuple2("yankees", "mets"))), + new Tuple2<>("california", + new Tuple2<>("dodgers", "giants")), + new Tuple2<>("new york", + new Tuple2<>("yankees", "mets"))), Arrays.asList( - new Tuple2>("california", - new Tuple2("sharks", "ducks")), - new Tuple2>("new york", - new Tuple2("rangers", "islanders")))); + new Tuple2<>("california", + new Tuple2<>("sharks", "ducks")), + new Tuple2<>("new york", + new Tuple2<>("rangers", "islanders")))); JavaDStream> stream1 = JavaTestUtils.attachTestInputStream( @@ -1664,13 +1660,13 @@ public void testJoin() { @Test public void testLeftOuterJoin() { List>> stringStringKVStream1 = Arrays.asList( - Arrays.asList(new 
Tuple2("california", "dodgers"), - new Tuple2("new york", "yankees")), - Arrays.asList(new Tuple2("california", "sharks") )); + Arrays.asList(new Tuple2<>("california", "dodgers"), + new Tuple2<>("new york", "yankees")), + Arrays.asList(new Tuple2<>("california", "sharks") )); List>> stringStringKVStream2 = Arrays.asList( - Arrays.asList(new Tuple2("california", "giants") ), - Arrays.asList(new Tuple2("new york", "islanders") ) + Arrays.asList(new Tuple2<>("california", "giants") ), + Arrays.asList(new Tuple2<>("new york", "islanders") ) ); @@ -1713,7 +1709,7 @@ public void testCheckpointMasterRecovery() throws InterruptedException { JavaDStream stream = JavaCheckpointTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream letterCount = stream.map(new Function() { @Override - public Integer call(String s) throws Exception { + public Integer call(String s) { return s.length(); } }); @@ -1752,6 +1748,7 @@ public void testContextGetOrCreate() throws InterruptedException { // (used to detect the new context) final AtomicBoolean newContextCreated = new AtomicBoolean(false); Function0 creatingFunc = new Function0() { + @Override public JavaStreamingContext call() { newContextCreated.set(true); return new JavaStreamingContext(conf, Seconds.apply(1)); @@ -1765,20 +1762,20 @@ public JavaStreamingContext call() { newContextCreated.set(false); ssc = JavaStreamingContext.getOrCreate(corruptedCheckpointDir, creatingFunc, - new org.apache.hadoop.conf.Configuration(), true); + new Configuration(), true); Assert.assertTrue("new context not created", newContextCreated.get()); ssc.stop(); newContextCreated.set(false); ssc = JavaStreamingContext.getOrCreate(checkpointDir, creatingFunc, - new org.apache.hadoop.conf.Configuration()); + new Configuration()); Assert.assertTrue("old context not recovered", !newContextCreated.get()); ssc.stop(); newContextCreated.set(false); JavaSparkContext sc = new JavaSparkContext(conf); ssc = JavaStreamingContext.getOrCreate(checkpointDir, creatingFunc, - new org.apache.hadoop.conf.Configuration()); + new Configuration()); Assert.assertTrue("old context not recovered", !newContextCreated.get()); ssc.stop(); } @@ -1800,7 +1797,7 @@ public void testCheckpointofIndividualStream() throws InterruptedException { JavaDStream stream = JavaCheckpointTestUtils.attachTestInputStream(ssc, inputData, 1); JavaDStream letterCount = stream.map(new Function() { @Override - public Integer call(String s) throws Exception { + public Integer call(String s) { return s.length(); } }); @@ -1818,29 +1815,26 @@ public Integer call(String s) throws Exception { // InputStream functionality is deferred to the existing Scala tests. 
@Test public void testSocketTextStream() { - JavaReceiverInputDStream test = ssc.socketTextStream("localhost", 12345); + ssc.socketTextStream("localhost", 12345); } @Test public void testSocketString() { - - class Converter implements Function> { - public Iterable call(InputStream in) throws IOException { - BufferedReader reader = new BufferedReader(new InputStreamReader(in)); - List out = new ArrayList(); - while (true) { - String line = reader.readLine(); - if (line == null) { break; } - out.add(line); - } - return out; - } - } - - JavaDStream test = ssc.socketStream( + ssc.socketStream( "localhost", 12345, - new Converter(), + new Function>() { + @Override + public Iterable call(InputStream in) throws IOException { + List out = new ArrayList<>(); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { + for (String line; (line = reader.readLine()) != null;) { + out.add(line); + } + } + return out; + } + }, StorageLevel.MEMORY_ONLY()); } @@ -1870,7 +1864,7 @@ public void testFileStream() throws IOException { TextInputFormat.class, new Function() { @Override - public Boolean call(Path v1) throws Exception { + public Boolean call(Path v1) { return Boolean.TRUE; } }, @@ -1879,7 +1873,7 @@ public Boolean call(Path v1) throws Exception { JavaDStream test = inputStream.map( new Function, String>() { @Override - public String call(Tuple2 v1) throws Exception { + public String call(Tuple2 v1) { return v1._2().toString(); } }); @@ -1892,19 +1886,15 @@ public String call(Tuple2 v1) throws Exception { @Test public void testRawSocketStream() { - JavaReceiverInputDStream test = ssc.rawSocketStream("localhost", 12345); + ssc.rawSocketStream("localhost", 12345); } - private List> fileTestPrepare(File testDir) throws IOException { + private static List> fileTestPrepare(File testDir) throws IOException { File existingFile = new File(testDir, "0"); Files.write("0\n", existingFile, Charset.forName("UTF-8")); - assertTrue(existingFile.setLastModified(1000) && existingFile.lastModified() == 1000); - - List> expected = Arrays.asList( - Arrays.asList("0") - ); - - return expected; + Assert.assertTrue(existingFile.setLastModified(1000)); + Assert.assertEquals(1000, existingFile.lastModified()); + return Arrays.asList(Arrays.asList("0")); } @SuppressWarnings("unchecked") diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaReceiverAPISuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaReceiverAPISuite.java index 1b0787fe69dec..ec2bffd6a5b97 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaReceiverAPISuite.java +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaReceiverAPISuite.java @@ -36,7 +36,6 @@ import java.io.Serializable; import java.net.ConnectException; import java.net.Socket; -import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; public class JavaReceiverAPISuite implements Serializable { @@ -64,16 +63,16 @@ public void testReceiver() throws InterruptedException { ssc.receiverStream(new JavaSocketReceiver("localhost", server.port())); JavaDStream mapped = input.map(new Function() { @Override - public String call(String v1) throws Exception { + public String call(String v1) { return v1 + "."; } }); mapped.foreachRDD(new Function, Void>() { @Override - public Void call(JavaRDD rdd) throws Exception { - long count = rdd.count(); - dataCounter.addAndGet(count); - return null; + public Void call(JavaRDD rdd) { + long count = rdd.count(); + dataCounter.addAndGet(count); + return null; 
} }); @@ -83,7 +82,7 @@ public Void call(JavaRDD rdd) throws Exception { Thread.sleep(200); for (int i = 0; i < 6; i++) { - server.send("" + i + "\n"); // \n to make sure these are separate lines + server.send(i + "\n"); // \n to make sure these are separate lines Thread.sleep(100); } while (dataCounter.get() == 0 && System.currentTimeMillis() - startTime < timeout) { @@ -95,50 +94,49 @@ public Void call(JavaRDD rdd) throws Exception { server.stop(); } } -} -class JavaSocketReceiver extends Receiver { + private static class JavaSocketReceiver extends Receiver { - String host = null; - int port = -1; + String host = null; + int port = -1; - public JavaSocketReceiver(String host_ , int port_) { - super(StorageLevel.MEMORY_AND_DISK()); - host = host_; - port = port_; - } + JavaSocketReceiver(String host_ , int port_) { + super(StorageLevel.MEMORY_AND_DISK()); + host = host_; + port = port_; + } - @Override - public void onStart() { - new Thread() { - @Override public void run() { - receive(); - } - }.start(); - } + @Override + public void onStart() { + new Thread() { + @Override public void run() { + receive(); + } + }.start(); + } - @Override - public void onStop() { - } + @Override + public void onStop() { + } - private void receive() { - Socket socket = null; - try { - socket = new Socket(host, port); - BufferedReader in = new BufferedReader(new InputStreamReader(socket.getInputStream())); - String userInput; - while ((userInput = in.readLine()) != null) { - store(userInput); + private void receive() { + try { + Socket socket = new Socket(host, port); + BufferedReader in = new BufferedReader(new InputStreamReader(socket.getInputStream())); + String userInput; + while ((userInput = in.readLine()) != null) { + store(userInput); + } + in.close(); + socket.close(); + } catch(ConnectException ce) { + ce.printStackTrace(); + restart("Could not connect", ce); + } catch(Throwable t) { + t.printStackTrace(); + restart("Error receiving data", t); } - in.close(); - socket.close(); - } catch(ConnectException ce) { - ce.printStackTrace(); - restart("Could not connect", ce); - } catch(Throwable t) { - t.printStackTrace(); - restart("Error receiving data", t); } } -} +} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 1bba7a143edf2..a6956533c07a5 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -408,10 +408,14 @@ class CheckpointSuite extends TestSuiteBase { ssc = new StreamingContext(checkpointDir) ssc.start() - val outputNew = advanceTimeWithRealDelay(ssc, 2) eventually(timeout(10.seconds)) { assert(RateTestReceiver.getActive().nonEmpty) + } + + advanceTimeWithRealDelay(ssc, 2) + + eventually(timeout(10.seconds)) { assert(RateTestReceiver.getActive().get.getDefaultBlockGeneratorRateLimit() === 200) } ssc.stop() diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala index ec2852d9a0206..047e38ef90998 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala @@ -76,6 +76,9 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { fail("Timeout: cannot finish all batches in 30 seconds") } + // Ensure progress listener has been notified 
of all events + ssc.scheduler.listenerBus.waitUntilEmpty(500) + // Verify all "InputInfo"s have been reported assert(ssc.progressListener.numTotalReceivedRecords === input.size) assert(ssc.progressListener.numTotalProcessedRecords === input.size) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala index 6c0c926755c20..13cfe29d7b304 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala @@ -29,7 +29,7 @@ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark._ -import org.apache.spark.network.nio.NioBlockTransferService +import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.rpc.RpcEnv import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.serializer.KryoSerializer @@ -47,7 +47,9 @@ class ReceivedBlockHandlerSuite with Matchers with Logging { - val conf = new SparkConf().set("spark.streaming.receiver.writeAheadLog.rollingIntervalSecs", "1") + val conf = new SparkConf() + .set("spark.streaming.receiver.writeAheadLog.rollingIntervalSecs", "1") + .set("spark.app.id", "streaming-test") val hadoopConf = new Configuration() val streamId = 1 val securityMgr = new SecurityManager(conf) @@ -184,7 +186,7 @@ class ReceivedBlockHandlerSuite } test("Test Block - isFullyConsumed") { - val sparkConf = new SparkConf() + val sparkConf = new SparkConf().set("spark.app.id", "streaming-test") sparkConf.set("spark.storage.unrollMemoryThreshold", "512") // spark.storage.unrollFraction set to 0.4 for BlockManager sparkConf.set("spark.storage.unrollFraction", "0.4") @@ -251,7 +253,7 @@ class ReceivedBlockHandlerSuite maxMem: Long, conf: SparkConf, name: String = SparkContext.DRIVER_IDENTIFIER): BlockManager = { - val transfer = new NioBlockTransferService(conf, securityMgr) + val transfer = new NettyBlockTransferService(conf, securityMgr, numCores = 1) val manager = new BlockManager(name, rpcEnv, blockManagerMaster, serializer, maxMem, conf, mapOutputTracker, shuffleManager, transfer, securityMgr, 0) manager.initialize("app-id") diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index 7423ef6bcb6ea..c7a877142b374 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -30,7 +30,7 @@ import org.scalatest.concurrent.Timeouts import org.scalatest.exceptions.TestFailedDueToTimeoutException import org.scalatest.time.SpanSugar._ -import org.apache.spark.{Logging, SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark._ import org.apache.spark.metrics.MetricsSystem import org.apache.spark.metrics.source.Source import org.apache.spark.storage.StorageLevel @@ -180,6 +180,38 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo assert(ssc.scheduler.isStarted === false) } + test("start should set job group and description of streaming jobs correctly") { + ssc = new StreamingContext(conf, batchDuration) + ssc.sc.setJobGroup("non-streaming", "non-streaming", true) + val sc = ssc.sc + + @volatile var jobGroupFound: String = "" + @volatile var jobDescFound: String = "" + 
@volatile var jobInterruptFound: String = "" + @volatile var allFound: Boolean = false + + addInputStream(ssc).foreachRDD { rdd => + jobGroupFound = sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) + jobDescFound = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) + jobInterruptFound = sc.getLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL) + allFound = true + } + ssc.start() + + eventually(timeout(10 seconds), interval(10 milliseconds)) { + assert(allFound === true) + } + + // Verify streaming jobs have expected thread-local properties + assert(jobGroupFound === null) + assert(jobDescFound.contains("Streaming job from")) + assert(jobInterruptFound === "false") + + // Verify current thread's thread-local properties have not changed + assert(sc.getLocalProperty(SparkContext.SPARK_JOB_GROUP_ID) === "non-streaming") + assert(sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) === "non-streaming") + assert(sc.getLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL) === "true") + } test("start multiple times") { ssc = new StreamingContext(master, appName, batchDuration) @@ -726,16 +758,26 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo } test("queueStream doesn't support checkpointing") { - val checkpointDir = Utils.createTempDir() - ssc = new StreamingContext(master, appName, batchDuration) - val rdd = ssc.sparkContext.parallelize(1 to 10) - ssc.queueStream[Int](Queue(rdd)).print() - ssc.checkpoint(checkpointDir.getAbsolutePath) - val e = intercept[NotSerializableException] { - ssc.start() + val checkpointDirectory = Utils.createTempDir().getAbsolutePath() + def creatingFunction(): StreamingContext = { + val _ssc = new StreamingContext(conf, batchDuration) + val rdd = _ssc.sparkContext.parallelize(1 to 10) + _ssc.checkpoint(checkpointDirectory) + _ssc.queueStream[Int](Queue(rdd)).register() + _ssc + } + ssc = StreamingContext.getOrCreate(checkpointDirectory, creatingFunction _) + ssc.start() + eventually(timeout(10000 millis)) { + assert(Checkpoint.getCheckpointFiles(checkpointDirectory).size > 1) + } + ssc.stop() + val e = intercept[SparkException] { + ssc = StreamingContext.getOrCreate(checkpointDirectory, creatingFunction _) } // StreamingContext.validate changes the message, so use "contains" here - assert(e.getMessage.contains("queueStream doesn't support checkpointing")) + assert(e.getCause.getMessage.contains("queueStream doesn't support checkpointing. 
" + + "Please don't use queueStream when checkpointing is enabled.")) } def addInputStream(s: StreamingContext): DStream[Int] = { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala index d840c349bbbc4..d8fd2ced3bf3b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala @@ -140,6 +140,69 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers { } } + test("onBatchCompleted with successful batch") { + ssc = new StreamingContext("local[2]", "test", Milliseconds(1000)) + val inputStream = ssc.receiverStream(new StreamingListenerSuiteReceiver) + inputStream.foreachRDD(_.count) + + val failureReasons = startStreamingContextAndCollectFailureReasons(ssc) + assert(failureReasons != null && failureReasons.isEmpty, + "A successful batch should not set errorMessage") + } + + test("onBatchCompleted with failed batch and one failed job") { + ssc = new StreamingContext("local[2]", "test", Milliseconds(1000)) + val inputStream = ssc.receiverStream(new StreamingListenerSuiteReceiver) + inputStream.foreachRDD { _ => + throw new RuntimeException("This is a failed job") + } + + // Check if failureReasons contains the correct error message + val failureReasons = startStreamingContextAndCollectFailureReasons(ssc, isFailed = true) + assert(failureReasons != null) + assert(failureReasons.size === 1) + assert(failureReasons.contains(0)) + assert(failureReasons(0).contains("This is a failed job")) + } + + test("onBatchCompleted with failed batch and multiple failed jobs") { + ssc = new StreamingContext("local[2]", "test", Milliseconds(1000)) + val inputStream = ssc.receiverStream(new StreamingListenerSuiteReceiver) + inputStream.foreachRDD { _ => + throw new RuntimeException("This is a failed job") + } + inputStream.foreachRDD { _ => + throw new RuntimeException("This is another failed job") + } + + // Check if failureReasons contains the correct error messages + val failureReasons = + startStreamingContextAndCollectFailureReasons(ssc, isFailed = true) + assert(failureReasons != null) + assert(failureReasons.size === 2) + assert(failureReasons.contains(0)) + assert(failureReasons.contains(1)) + assert(failureReasons(0).contains("This is a failed job")) + assert(failureReasons(1).contains("This is another failed job")) + } + + private def startStreamingContextAndCollectFailureReasons( + _ssc: StreamingContext, isFailed: Boolean = false): Map[Int, String] = { + val failureReasonsCollector = new FailureReasonsCollector() + _ssc.addStreamingListener(failureReasonsCollector) + val batchCounter = new BatchCounter(_ssc) + _ssc.start() + // Make sure running at least one batch + batchCounter.waitUntilBatchesCompleted(expectedNumCompletedBatches = 1, timeout = 10000) + if (isFailed) { + intercept[RuntimeException] { + _ssc.awaitTerminationOrTimeout(10000) + } + } + _ssc.stop() + failureReasonsCollector.failureReasons + } + /** Check if a sequence of numbers is in increasing order */ def isInIncreasingOrder(seq: Seq[Long]): Boolean = { for (i <- 1 until seq.size) { @@ -205,3 +268,16 @@ class StreamingListenerSuiteReceiver extends Receiver[Any](StorageLevel.MEMORY_O } def onStop() { } } + +/** + * A StreamingListener that saves the latest `failureReasons` in `BatchInfo` to the `failureReasons` + * field. 
+ */ +class FailureReasonsCollector extends StreamingListener { + + @volatile var failureReasons: Map[Int, String] = null + + override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { + failureReasons = batchCompleted.batchInfo.failureReasons + } +} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala index a38cc603f2190..2f11b255f1104 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala @@ -184,9 +184,10 @@ class BlockGeneratorSuite extends SparkFunSuite with BeforeAndAfter { // Verify that the final data is present in the final generated block and // pushed before complete stop assert(blockGenerator.isStopped() === false) // generator has not stopped yet - clock.advance(blockIntervalMs) // force block generation - failAfter(1 second) { - thread.join() + eventually(timeout(10 seconds), interval(10 milliseconds)) { + // Keep calling `advance` to avoid blocking forever in `clock.waitTillTime` + clock.advance(blockIntervalMs) + assert(thread.isAlive === false) } assert(blockGenerator.isStopped() === true) // generator has finally been completely stopped assert(listener.pushedData === data, "All data not pushed by stop()") diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index dd292ba4dd949..45138b748ecab 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -60,6 +60,26 @@ class ReceiverTrackerSuite extends TestSuiteBase { } } } + + test("should restart receiver after stopping it") { + withStreamingContext(new StreamingContext(conf, Milliseconds(100))) { ssc => + @volatile var startTimes = 0 + ssc.addStreamingListener(new StreamingListener { + override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { + startTimes += 1 + } + }) + val input = ssc.receiverStream(new StoppableReceiver) + val output = new TestOutputStream(input) + output.register() + ssc.start() + StoppableReceiver.shouldStop = true + eventually(timeout(10 seconds), interval(10 millis)) { + // The receiver is stopped once, so if it's restarted, it should be started twice. 
+ assert(startTimes === 2) + } + } + } } /** An input DStream with for testing rate controlling */ @@ -132,3 +152,34 @@ private[streaming] object RateTestReceiver { def getActive(): Option[RateTestReceiver] = Option(activeReceiver) } + +/** + * A custom receiver that could be stopped via StoppableReceiver.shouldStop + */ +class StoppableReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) { + + var receivingThreadOption: Option[Thread] = None + + def onStart() { + val thread = new Thread() { + override def run() { + while (!StoppableReceiver.shouldStop) { + Thread.sleep(10) + } + StoppableReceiver.this.stop("stop") + } + } + thread.start() + } + + def onStop() { + StoppableReceiver.shouldStop = true + receivingThreadOption.foreach(_.join()) + // Reset it so as to restart it + StoppableReceiver.shouldStop = false + } +} + +object StoppableReceiver { + @volatile var shouldStop = false +} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/RecurringTimerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/RecurringTimerSuite.scala new file mode 100644 index 0000000000000..0544972d95c03 --- /dev/null +++ b/streaming/src/test/scala/org/apache/spark/streaming/util/RecurringTimerSuite.scala @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.util + +import scala.collection.mutable +import scala.concurrent.duration._ + +import org.scalatest.PrivateMethodTester +import org.scalatest.concurrent.Eventually._ + +import org.apache.spark.SparkFunSuite +import org.apache.spark.util.ManualClock + +class RecurringTimerSuite extends SparkFunSuite with PrivateMethodTester { + + test("basic") { + val clock = new ManualClock() + val results = new mutable.ArrayBuffer[Long]() with mutable.SynchronizedBuffer[Long] + val timer = new RecurringTimer(clock, 100, time => { + results += time + }, "RecurringTimerSuite-basic") + timer.start(0) + eventually(timeout(10.seconds), interval(10.millis)) { + assert(results === Seq(0L)) + } + clock.advance(100) + eventually(timeout(10.seconds), interval(10.millis)) { + assert(results === Seq(0L, 100L)) + } + clock.advance(200) + eventually(timeout(10.seconds), interval(10.millis)) { + assert(results === Seq(0L, 100L, 200L, 300L)) + } + assert(timer.stop(interruptTimer = true) === 300L) + } + + test("SPARK-10224: call 'callback' after stopping") { + val clock = new ManualClock() + val results = new mutable.ArrayBuffer[Long]() with mutable.SynchronizedBuffer[Long] + val timer = new RecurringTimer(clock, 100, time => { + results += time + }, "RecurringTimerSuite-SPARK-10224") + timer.start(0) + eventually(timeout(10.seconds), interval(10.millis)) { + assert(results === Seq(0L)) + } + @volatile var lastTime = -1L + // Now RecurringTimer is waiting for the next interval + val thread = new Thread { + override def run(): Unit = { + lastTime = timer.stop(interruptTimer = false) + } + } + thread.start() + val stopped = PrivateMethod[RecurringTimer]('stopped) + // Make sure the `stopped` field has been changed + eventually(timeout(10.seconds), interval(10.millis)) { + assert(timer.invokePrivate(stopped()) === true) + } + clock.advance(200) + // When RecurringTimer is awake from clock.waitTillTime, it will call `callback` once. + // Then it will find `stopped` is true and exit the loop, but it should call `callback` again + // before exiting its internal thread. + thread.join() + assert(results === Seq(0L, 100L, 200L)) + assert(lastTime === 200L) + } +} diff --git a/tools/pom.xml b/tools/pom.xml index 298ee2348b58e..1e64f280e5bed 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/unsafe/pom.xml b/unsafe/pom.xml index 89475ee3cf5a1..066abe92e51c0 100644 --- a/unsafe/pom.xml +++ b/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index cbcab958c05a9..216aeea60d1c8 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -90,11 +90,7 @@ public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) { * Creates an UTF8String from given address (base and offset) and length. 
*/ public static UTF8String fromAddress(Object base, long offset, int numBytes) { - if (base != null) { - return new UTF8String(base, offset, numBytes); - } else { - return null; - } + return new UTF8String(base, offset, numBytes); } /** diff --git a/yarn/pom.xml b/yarn/pom.xml index f6737695307a2..d8e4a4bbead81 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.10 - 1.5.0-SNAPSHOT + 1.6.0-SNAPSHOT ../pom.xml diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 991b5cec00bd8..07a0a45e60cbe 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -345,7 +345,7 @@ private[spark] class ApplicationMaster( if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) { finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES, - "Max number of executor failures reached") + s"Max number of executor failures ($maxNumExecutorFailures) reached") } else { logDebug("Sending progress") allocator.allocateResources() @@ -590,6 +590,13 @@ private[spark] class ApplicationMaster( case None => logWarning("Container allocator is not ready to kill executors yet.") } context.reply(true) + + case GetExecutorLossReason(eid) => + Option(allocator) match { + case Some(a) => a.enqueueGetLossReasonRequest(eid, context) + case None => logWarning(s"Container allocator is not ready to find" + + s" executor loss reasons yet.") + } } override def onDisconnected(remoteAddress: RpcAddress): Unit = { diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala index b08412414aa1c..17d9943c795e3 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala @@ -105,9 +105,9 @@ class ApplicationMasterArguments(val args: Array[String]) { | place on the PYTHONPATH for Python apps. | --args ARGS Arguments to be passed to your application's main class. | Multiple invocations are possible, each will be passed in order. - | --num-executors NUM Number of executors to start (Default: 2) | --executor-cores NUM Number of cores for the executors (Default: 1) | --executor-memory MEM Memory per executor (e.g. 1000M, 2G) (Default: 1G) + | --properties-file FILE Path to a custom Spark properties file. """.stripMargin) // scalastyle:on println System.exit(exitCode) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index e9a02baafd28e..8c53c24a79c48 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -86,7 +86,11 @@ private[spark] class Client( private val fireAndForget = isClusterMode && !sparkConf.getBoolean("spark.yarn.submit.waitAppCompletion", true) - def stop(): Unit = yarnClient.stop() + def stop(): Unit = { + yarnClient.stop() + // Unset YARN mode system env variable, to allow switching between cluster types. + System.clearProperty("SPARK_YARN_MODE") + } /** * Submit an application running our ApplicationMaster to the ResourceManager. 
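Editor's note on the Client.stop() change above: clearing the SPARK_YARN_MODE JVM property on shutdown is what allows a single JVM (for example a test suite) to switch from a YARN deployment to another cluster type afterwards. The sketch below is not Spark code, only a minimal illustration of the set-on-start / clear-on-stop pattern using the standard java.lang.System property API; the YarnModeClient name is invented for the example.

object YarnModeClient {
  // Mark the JVM as running in YARN mode while the client is active.
  def start(): Unit = System.setProperty("SPARK_YARN_MODE", "true")

  // Clear the flag again so later contexts created in the same JVM are not
  // mistakenly treated as YARN deployments.
  def stop(): Unit = System.clearProperty("SPARK_YARN_MODE")
}

// Usage: the property is only visible between start() and stop().
// YarnModeClient.start()
// assert(sys.props.get("SPARK_YARN_MODE") == Some("true"))
// YarnModeClient.stop()
// assert(sys.props.get("SPARK_YARN_MODE") == None)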
@@ -1045,7 +1049,10 @@ object Client extends Logging { s"in favor of the $CONF_SPARK_JAR configuration variable.") System.getenv(ENV_SPARK_JAR) } else { - SparkContext.jarOfClass(this.getClass).head + SparkContext.jarOfClass(this.getClass).getOrElse(throw new SparkException("Could not " + + "find jar containing Spark classes. The jar can be defined using the " + + "spark.yarn.jar configuration option. If testing Spark, either set that option or " + + "make sure SPARK_PREPEND_CLASSES is not set.")) } } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala index 4f42ffefa77f9..1165061db21e3 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala @@ -81,22 +81,7 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) .orNull // If dynamic allocation is enabled, start at the configured initial number of executors. // Default to minExecutors if no initialExecutors is set. - if (isDynamicAllocationEnabled) { - val minExecutorsConf = "spark.dynamicAllocation.minExecutors" - val initialExecutorsConf = "spark.dynamicAllocation.initialExecutors" - val maxExecutorsConf = "spark.dynamicAllocation.maxExecutors" - val minNumExecutors = sparkConf.getInt(minExecutorsConf, 0) - val initialNumExecutors = sparkConf.getInt(initialExecutorsConf, minNumExecutors) - val maxNumExecutors = sparkConf.getInt(maxExecutorsConf, Integer.MAX_VALUE) - - // If defined, initial executors must be between min and max - if (initialNumExecutors < minNumExecutors || initialNumExecutors > maxNumExecutors) { - throw new IllegalArgumentException( - s"$initialExecutorsConf must be between $minExecutorsConf and $maxNumExecutors!") - } - - numExecutors = initialNumExecutors - } + numExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sparkConf) principal = Option(principal) .orElse(sparkConf.getOption("spark.yarn.principal")) .orNull diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 5f897cbcb4e9f..9e1ef1b3b4229 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -21,8 +21,9 @@ import java.util.Collections import java.util.concurrent._ import java.util.regex.Pattern -import scala.collection.JavaConverters._ +import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.collection.JavaConverters._ import com.google.common.util.concurrent.ThreadFactoryBuilder @@ -36,8 +37,9 @@ import org.apache.log4j.{Level, Logger} import org.apache.spark.{Logging, SecurityManager, SparkConf} import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ -import org.apache.spark.rpc.RpcEndpointRef -import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ +import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef} +import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason} +import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor import org.apache.spark.util.Utils /** @@ -87,11 +89,12 @@ private[yarn] class YarnAllocator( @volatile private var numExecutorsFailed = 0 @volatile private var targetNumExecutors = - if (Utils.isDynamicAllocationEnabled(sparkConf)) { - 
sparkConf.getInt("spark.dynamicAllocation.initialExecutors", 0) - } else { - sparkConf.getInt("spark.executor.instances", YarnSparkHadoopUtil.DEFAULT_NUMBER_EXECUTORS) - } + YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sparkConf) + + // Executor loss reason requests that are pending - maps from executor ID for inquiry to a + // list of requesters that should be responded to once we find out why the given executor + // was lost. + private val pendingLossReasonRequests = new HashMap[String, mutable.Buffer[RpcCallContext]] // Keep track of which container is running which executor to remove the executors later // Visible for testing. @@ -235,9 +238,7 @@ private[yarn] class YarnAllocator( val completedContainers = allocateResponse.getCompletedContainersStatuses() if (completedContainers.size > 0) { logDebug("Completed %d containers".format(completedContainers.size)) - processCompletedContainers(completedContainers.asScala) - logDebug("Finished processing %d completed containers. Current running executor count: %d." .format(completedContainers.size, numExecutorsRunning)) } @@ -429,7 +430,7 @@ private[yarn] class YarnAllocator( for (completedContainer <- completedContainers) { val containerId = completedContainer.getContainerId val alreadyReleased = releasedContainers.remove(containerId) - if (!alreadyReleased) { + val exitReason = if (!alreadyReleased) { // Decrement the number of executors running. The next iteration of // the ApplicationMaster's reporting thread will take care of allocating. numExecutorsRunning -= 1 @@ -440,22 +441,42 @@ private[yarn] class YarnAllocator( // Hadoop 2.2.X added a ContainerExitStatus we should switch to use // there are some exit status' we shouldn't necessarily count against us, but for // now I think its ok as none of the containers are expected to exit - if (completedContainer.getExitStatus == ContainerExitStatus.PREEMPTED) { - logInfo("Container preempted: " + containerId) - } else if (completedContainer.getExitStatus == -103) { // vmem limit exceeded - logWarning(memLimitExceededLogMessage( - completedContainer.getDiagnostics, - VMEM_EXCEEDED_PATTERN)) - } else if (completedContainer.getExitStatus == -104) { // pmem limit exceeded - logWarning(memLimitExceededLogMessage( - completedContainer.getDiagnostics, - PMEM_EXCEEDED_PATTERN)) - } else if (completedContainer.getExitStatus != 0) { - logInfo("Container marked as failed: " + containerId + - ". Exit status: " + completedContainer.getExitStatus + - ". Diagnostics: " + completedContainer.getDiagnostics) - numExecutorsFailed += 1 + val exitStatus = completedContainer.getExitStatus + val (isNormalExit, containerExitReason) = exitStatus match { + case ContainerExitStatus.SUCCESS => + (true, s"Executor for container $containerId exited normally.") + case ContainerExitStatus.PREEMPTED => + // Preemption should count as a normal exit, since YARN preempts containers merely + // to do resource sharing, and tasks that fail due to preempted executors could + // just as easily finish on any other executor. See SPARK-8167. 
+ (true, s"Container $containerId was preempted.") + // Should probably still count memory exceeded exit codes towards task failures + case VMEM_EXCEEDED_EXIT_CODE => + (false, memLimitExceededLogMessage( + completedContainer.getDiagnostics, + VMEM_EXCEEDED_PATTERN)) + case PMEM_EXCEEDED_EXIT_CODE => + (false, memLimitExceededLogMessage( + completedContainer.getDiagnostics, + PMEM_EXCEEDED_PATTERN)) + case unknown => + numExecutorsFailed += 1 + (false, "Container marked as failed: " + containerId + + ". Exit status: " + completedContainer.getExitStatus + + ". Diagnostics: " + completedContainer.getDiagnostics) + + } + if (isNormalExit) { + logInfo(containerExitReason) + } else { + logWarning(containerExitReason) } + ExecutorExited(0, isNormalExit, containerExitReason) + } else { + // If we have already released this container, then it must mean + // that the driver has explicitly requested it to be killed + ExecutorExited(completedContainer.getExitStatus, isNormalExit = true, + s"Container $containerId exited from explicit termination request.") } if (allocatedContainerToHostMap.contains(containerId)) { @@ -474,18 +495,35 @@ private[yarn] class YarnAllocator( containerIdToExecutorId.remove(containerId).foreach { eid => executorIdToContainer.remove(eid) - + pendingLossReasonRequests.remove(eid).foreach { pendingRequests => + // Notify application of executor loss reasons so it can decide whether it should abort + pendingRequests.foreach(_.reply(exitReason)) + } if (!alreadyReleased) { // The executor could have gone away (like no route to host, node failure, etc) // Notify backend about the failure of the executor numUnexpectedContainerRelease += 1 - driverRef.send(RemoveExecutor(eid, - s"Yarn deallocated the executor $eid (container $containerId)")) + driverRef.send(RemoveExecutor(eid, exitReason)) } } } } + /** + * Register that some RpcCallContext has asked the AM why the executor was lost. Note that + * we can only find the loss reason to send back in the next call to allocateResources(). 
+ */ + private[yarn] def enqueueGetLossReasonRequest( + eid: String, + context: RpcCallContext): Unit = synchronized { + if (executorIdToContainer.contains(eid)) { + pendingLossReasonRequests + .getOrElseUpdate(eid, new ArrayBuffer[RpcCallContext]) += context + } else { + logWarning(s"Tried to get the loss reason for non-existent executor $eid") + } + } + private def internalReleaseContainer(container: Container): Unit = { releasedContainers.add(container.getId()) amClient.releaseAssignedContainer(container.getId()) @@ -501,6 +539,8 @@ private object YarnAllocator { Pattern.compile(s"$MEM_REGEX of $MEM_REGEX physical memory used") val VMEM_EXCEEDED_PATTERN = Pattern.compile(s"$MEM_REGEX of $MEM_REGEX virtual memory used") + val VMEM_EXCEEDED_EXIT_CODE = -103 + val PMEM_EXCEEDED_EXIT_CODE = -104 def memLimitExceededLogMessage(diagnostics: String, pattern: Pattern): String = { val matcher = pattern.matcher(diagnostics) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index 445d3dcd266db..f276e7efde9d7 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -314,5 +314,28 @@ object YarnSparkHadoopUtil { def getClassPathSeparator(): String = { classPathSeparatorField.get(null).asInstanceOf[String] } + + /** + * Getting the initial target number of executors depends on whether dynamic allocation is + * enabled. + */ + def getInitialTargetExecutorNumber(conf: SparkConf): Int = { + if (Utils.isDynamicAllocationEnabled(conf)) { + val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", 0) + val initialNumExecutors = + conf.getInt("spark.dynamicAllocation.initialExecutors", minNumExecutors) + val maxNumExecutors = conf.getInt("spark.dynamicAllocation.maxExecutors", Int.MaxValue) + require(initialNumExecutors >= minNumExecutors && initialNumExecutors <= maxNumExecutors, + s"initial executor number $initialNumExecutors must between min executor number" + + s"$minNumExecutors and max executor number $maxNumExecutors") + + initialNumExecutors + } else { + val targetNumExecutors = + sys.env.get("SPARK_EXECUTOR_INSTANCES").map(_.toInt).getOrElse(DEFAULT_NUMBER_EXECUTORS) + // System property can override environment variable. 
+ conf.getInt("spark.executor.instances", targetNumExecutors) + } + } } diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala index 1aed5a1675075..50b699f11b21c 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala @@ -17,21 +17,13 @@ package org.apache.spark.scheduler.cluster -import java.net.NetworkInterface - import org.apache.hadoop.yarn.api.ApplicationConstants.Environment - -import scala.collection.JavaConverters._ - -import org.apache.hadoop.yarn.api.records.NodeState -import org.apache.hadoop.yarn.client.api.YarnClient import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.spark.SparkContext import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil -import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ import org.apache.spark.scheduler.TaskSchedulerImpl -import org.apache.spark.util.{IntParam, Utils} +import org.apache.spark.util.Utils private[spark] class YarnClusterSchedulerBackend( scheduler: TaskSchedulerImpl, @@ -40,13 +32,7 @@ private[spark] class YarnClusterSchedulerBackend( override def start() { super.start() - totalExpectedExecutors = DEFAULT_NUMBER_EXECUTORS - if (System.getenv("SPARK_EXECUTOR_INSTANCES") != null) { - totalExpectedExecutors = IntParam.unapply(System.getenv("SPARK_EXECUTOR_INSTANCES")) - .getOrElse(totalExpectedExecutors) - } - // System property can override environment variable. - totalExpectedExecutors = sc.getConf.getInt("spark.executor.instances", totalExpectedExecutors) + totalExpectedExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sc.conf) } override def applicationId(): String = diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala index 49bee0866dd43..e1c67db76571f 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala @@ -30,6 +30,7 @@ import org.scalatest.Matchers import org.apache.hadoop.yarn.api.records.ApplicationAccessType import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException, SparkFunSuite} +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.util.Utils @@ -233,4 +234,15 @@ class YarnSparkHadoopUtilSuite extends SparkFunSuite with Matchers with Logging } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } + + test("check different hadoop utils based on env variable") { + try { + System.setProperty("SPARK_YARN_MODE", "true") + assert(SparkHadoopUtil.get.getClass === classOf[YarnSparkHadoopUtil]) + System.setProperty("SPARK_YARN_MODE", "false") + assert(SparkHadoopUtil.get.getClass === classOf[SparkHadoopUtil]) + } finally { + System.clearProperty("SPARK_YARN_MODE") + } + } }
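For readers skimming the YarnSparkHadoopUtil.getInitialTargetExecutorNumber change above (which ClientArguments, YarnAllocator, and YarnClusterSchedulerBackend now all share), here is a rough standalone sketch of the resolution order it centralizes. A plain Map[String, String] stands in for SparkConf, the default of 2 executors mirrors YarnSparkHadoopUtil.DEFAULT_NUMBER_EXECUTORS, and the dynamic-allocation check is simplified to reading spark.dynamicAllocation.enabled; the real Utils.isDynamicAllocationEnabled helper is more involved, so treat this only as an illustration of the precedence rules, not as the actual implementation.

object ExecutorCountSketch {
  // Resolve the initial target executor count:
  //  - with dynamic allocation: initialExecutors, defaulting to minExecutors,
  //    and required to sit between minExecutors and maxExecutors
  //  - otherwise: spark.executor.instances overrides SPARK_EXECUTOR_INSTANCES,
  //    which in turn overrides the default of 2
  def initialTargetExecutors(conf: Map[String, String], env: Map[String, String]): Int = {
    val defaultNumberExecutors = 2
    val dynamicAllocation = conf.get("spark.dynamicAllocation.enabled").exists(_.toBoolean)
    if (dynamicAllocation) {
      val min = conf.get("spark.dynamicAllocation.minExecutors").map(_.toInt).getOrElse(0)
      val initial = conf.get("spark.dynamicAllocation.initialExecutors").map(_.toInt).getOrElse(min)
      val max = conf.get("spark.dynamicAllocation.maxExecutors").map(_.toInt).getOrElse(Int.MaxValue)
      require(initial >= min && initial <= max,
        s"initial executor number $initial must be between $min and $max")
      initial
    } else {
      val fromEnv = env.get("SPARK_EXECUTOR_INSTANCES").map(_.toInt).getOrElse(defaultNumberExecutors)
      // The spark.executor.instances setting takes precedence over the environment variable.
      conf.get("spark.executor.instances").map(_.toInt).getOrElse(fromEnv)
    }
  }
}

// Example: an explicit spark.executor.instances wins over the environment variable.
// ExecutorCountSketch.initialTargetExecutors(
//   Map("spark.executor.instances" -> "5"),
//   Map("SPARK_EXECUTOR_INSTANCES" -> "3")) == 5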