From 43f11e68376560facd578ebb964832887b9d836c Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 22 May 2015 18:00:33 -0700
Subject: [PATCH] [SPARK-7322][SQL] Improve DataFrame window function
 documentation.

---
 .../org/apache/spark/sql/functions.scala      | 202 +++++++++---------
 .../sql/hive/HiveDataFrameWindowSuite.scala   |  20 +-
 2 files changed, 104 insertions(+), 118 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 8775be724e0f9..371a4683b2e24 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -326,168 +326,135 @@ object functions {
   //////////////////////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Window function: returns the lag value of current row of the expression,
-   * null when the current row extends before the beginning of the window.
+   * Window function: returns the value that is `offset` rows before the current row, and
+   * `null` if there is less than `offset` rows before the current row. For example,
+   * an `offset` of one will return the previous row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lag(columnName: String): Column = {
-    lag(columnName, 1)
-  }
-
-  /**
-   * Window function: returns the lag value of current row of the column,
-   * null when the current row extends before the beginning of the window.
+   * This is equivalent to the LAG function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lag(e: Column): Column = {
-    lag(e, 1)
+  def lag(e: Column, offset: Int): Column = {
+    lag(e, offset, null)
   }
 
   /**
-   * Window function: returns the lag values of current row of the expression,
-   * null when the current row extends before the beginning of the window.
+   * Window function: returns the value that is `offset` rows before the current row, and
+   * `null` if there is less than `offset` rows before the current row. For example,
+   * an `offset` of one will return the previous row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lag(e: Column, count: Int): Column = {
-    lag(e, count, null)
-  }
-
-  /**
-   * Window function: returns the lag values of current row of the column,
-   * null when the current row extends before the beginning of the window.
+   * This is equivalent to the LAG function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lag(columnName: String, count: Int): Column = {
-    lag(columnName, count, null)
+  def lag(columnName: String, offset: Int): Column = {
+    lag(columnName, offset, null)
   }
 
   /**
-   * Window function: returns the lag values of current row of the column,
-   * given default value when the current row extends before the beginning
-   * of the window.
+   * Window function: returns the value that is `offset` rows before the current row, and
+   * `defaultValue` if there is less than `offset` rows before the current row. For example,
+   * an `offset` of one will return the previous row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lag(columnName: String, count: Int, defaultValue: Any): Column = {
-    lag(Column(columnName), count, defaultValue)
-  }
-
-  /**
-   * Window function: returns the lag values of current row of the expression,
-   * given default value when the current row extends before the beginning
-   * of the window.
+   * This is equivalent to the LAG function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lag(e: Column, count: Int, defaultValue: Any): Column = {
-    UnresolvedWindowFunction("lag", e.expr :: Literal(count) :: Literal(defaultValue) :: Nil)
+  def lag(columnName: String, offset: Int, defaultValue: Any): Column = {
+    lag(Column(columnName), offset, defaultValue)
   }
 
   /**
-   * Window function: returns the lead value of current row of the column,
-   * null when the current row extends before the end of the window.
+   * Window function: returns the value that is `offset` rows before the current row, and
+   * `defaultValue` if there is less than `offset` rows before the current row. For example,
+   * an `offset` of one will return the previous row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lead(columnName: String): Column = {
-    lead(columnName, 1)
-  }
-
-  /**
-   * Window function: returns the lead value of current row of the expression,
-   * null when the current row extends before the end of the window.
+   * This is equivalent to the LAG function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lead(e: Column): Column = {
-    lead(e, 1)
+  def lag(e: Column, offset: Int, defaultValue: Any): Column = {
+    UnresolvedWindowFunction("lag", e.expr :: Literal(offset) :: Literal(defaultValue) :: Nil)
   }
 
   /**
-   * Window function: returns the lead values of current row of the column,
-   * null when the current row extends before the end of the window.
+   * Window function: returns the value that is `offset` rows after the current row, and
+   * `null` if there is less than `offset` rows after the current row. For example,
+   * an `offset` of one will return the next row at any given point in the window partition.
    *
-   * @group window_funcs
-   * @since 1.4.0
-   */
-  def lead(columnName: String, count: Int): Column = {
-    lead(columnName, count, null)
-  }
-
-  /**
-   * Window function: returns the lead values of current row of the expression,
-   * null when the current row extends before the end of the window.
+   * This is equivalent to the LEAD function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lead(e: Column, count: Int): Column = {
-    lead(e, count, null)
+  def lead(columnName: String, offset: Int): Column = {
+    lead(columnName, offset, null)
   }
 
   /**
-   * Window function: returns the lead values of current row of the column,
-   * given default value when the current row extends before the end of the window.
+   * Window function: returns the value that is `offset` rows after the current row, and
+   * `null` if there is less than `offset` rows after the current row. For example,
+   * an `offset` of one will return the next row at any given point in the window partition.
+   *
+   * This is equivalent to the LEAD function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lead(columnName: String, count: Int, defaultValue: Any): Column = {
-    lead(Column(columnName), count, defaultValue)
+  def lead(e: Column, offset: Int): Column = {
+    lead(e, offset, null)
   }
 
   /**
-   * Window function: returns the lead values of current row of the expression,
-   * given default value when the current row extends before the end of the window.
+   * Window function: returns the value that is `offset` rows after the current row, and
+   * `defaultValue` if there is less than `offset` rows after the current row. For example,
+   * an `offset` of one will return the next row at any given point in the window partition.
+   *
+   * This is equivalent to the LEAD function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def lead(e: Column, count: Int, defaultValue: Any): Column = {
-    UnresolvedWindowFunction("lead", e.expr :: Literal(count) :: Literal(defaultValue) :: Nil)
+  def lead(columnName: String, offset: Int, defaultValue: Any): Column = {
+    lead(Column(columnName), offset, defaultValue)
   }
 
   /**
-   * NTILE for specified expression.
-   * NTILE allows easy calculation of tertiles, quartiles, deciles and other
-   * common summary statistics. This function divides an ordered partition into a specified
-   * number of groups called buckets and assigns a bucket number to each row in the partition.
+   * Window function: returns the value that is `offset` rows after the current row, and
+   * `defaultValue` if there is less than `offset` rows after the current row. For example,
+   * an `offset` of one will return the next row at any given point in the window partition.
+   *
+   * This is equivalent to the LEAD function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def ntile(e: Column): Column = {
-    UnresolvedWindowFunction("ntile", e.expr :: Nil)
+  def lead(e: Column, offset: Int, defaultValue: Any): Column = {
+    UnresolvedWindowFunction("lead", e.expr :: Literal(offset) :: Literal(defaultValue) :: Nil)
   }
 
   /**
-   * NTILE for specified column.
-   * NTILE allows easy calculation of tertiles, quartiles, deciles and other
-   * common summary statistics. This function divides an ordered partition into a specified
-   * number of groups called buckets and assigns a bucket number to each row in the partition.
+   * Window function: returns a group id from 1 to `n` (inclusive) in a round-robin fashion in
+   * a window partition. Fow example, if `n` is 3, the first row will get 1, the second row will
+   * get 2, the third row will get 3, and the fourth row will get 1...
+   *
+   * This is equivalent to the NTILE function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
    */
-  def ntile(columnName: String): Column = {
-    ntile(Column(columnName))
+  def ntile(n: Int): Column = {
+    UnresolvedWindowFunction("ntile", lit(n).expr :: Nil)
   }
 
   /**
-   * Assigns a unique number (sequentially, starting from 1, as defined by ORDER BY) to each
-   * row within the partition.
+   * Window function: returns a sequential number starting at 1 within a window partition.
+   *
+   * This is equivalent to the ROW_NUMBER function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
@@ -497,11 +464,15 @@ object functions {
   }
 
   /**
-   * The difference between RANK and DENSE_RANK is that DENSE_RANK leaves no gaps in ranking
-   * sequence when there are ties. That is, if you were ranking a competition using DENSE_RANK
+   * Window function: returns the rank of rows within a window partition, without any gaps.
+   *
+   * The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+   * sequence when there are ties. That is, if you were ranking a competition using denseRank
    * and had three people tie for second place, you would say that all three were in second
    * place and that the next person came in third.
    *
+   * This is equivalent to the DENSE_RANK function in SQL.
+   *
    * @group window_funcs
    * @since 1.4.0
    */
@@ -510,11 +481,15 @@ object functions {
   }
 
   /**
-   * The difference between RANK and DENSE_RANK is that DENSE_RANK leaves no gaps in ranking
-   * sequence when there are ties. That is, if you were ranking a competition using DENSE_RANK
+   * Window function: returns the rank of rows within a window partition.
+   *
+   * The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+   * sequence when there are ties. That is, if you were ranking a competition using denseRank
    * and had three people tie for second place, you would say that all three were in second
    * place and that the next person came in third.
    *
+   * This is equivalent to the RANK function in SQL.
+   *
    * @group window_funcs
    * @since 1.4.0
    */
@@ -523,10 +498,21 @@ object functions {
   }
 
   /**
-   * CUME_DIST (defined as the inverse of percentile in some statistical books) computes
-   * the position of a specified value relative to a set of values.
-   * To compute the CUME_DIST of a value x in a set S of size N, you use the formula:
-   * CUME_DIST(x) = number of values in S coming before and including x in the specified order / N
+   * Window function: returns the cumulative distribution of values within a window partition,
+   * i.e. the fraction of rows that are below the current row.
+   *
+   * {{{
+   *   N = total number of rows in the partition
+   *   cumeDist(x) = number of values before (and including) x / N
+   * }}}
+   *
+   *
+   * This is computed by:
+   * {{{
+   *   (rank of row in its partition - 1) / (number of rows in the partition - 1)
+   * }}}
+   *
+   * This is equivalent to the CUME_DIST function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
@@ -536,10 +522,14 @@ object functions {
   }
 
   /**
-   * PERCENT_RANK is similar to CUME_DIST, but it uses rank values rather than row counts
-   * in its numerator.
-   * The formula:
-   * (rank of row in its partition - 1) / (number of rows in the partition - 1)
+   * Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
+   *
+   * This is computed by:
+   * {{{
+   *   (rank of row in its partition - 1) / (number of rows in the partition - 1)
+   * }}}
+   *
+   * This is equivalent to the PERCENT_RANK function in SQL.
    *
    * @group window_funcs
    * @since 1.4.0
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
index 6cea6776c8ca6..efb3f2545db84 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
@@ -31,8 +31,8 @@ class HiveDataFrameWindowSuite extends QueryTest {
 
     checkAnswer(
       df.select(
-        lead("key").over(w),
-        lead("value").over(w)),
+        lead("key", 1).over(w),
+        lead("value", 1).over(w)),
       Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil)
   }
 
@@ -42,8 +42,8 @@ class HiveDataFrameWindowSuite extends QueryTest {
 
     checkAnswer(
       df.select(
-        lead("key").over(w),
-        lead("value").over(w)),
+        lead("key", 1).over(w),
+        lead("value", 1).over(w)),
       Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil)
   }
 
@@ -53,7 +53,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
 
     checkAnswer(
       df.select(
-        lead("value").over(Window.partitionBy($"key").orderBy($"value"))),
+        lead("value", 1).over(Window.partitionBy($"key").orderBy($"value"))),
       sql(
         """SELECT
           | lead(value) OVER (PARTITION BY key ORDER BY value)
@@ -66,9 +66,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
 
     checkAnswer(
       df.select(
-        lag("value").over(
-          Window.partitionBy($"key")
-          .orderBy($"value"))),
+        lag("value", 1).over(Window.partitionBy($"key").orderBy($"value"))),
       sql(
         """SELECT
           | lag(value) OVER (PARTITION BY key ORDER BY value)
@@ -112,8 +110,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
         mean("key").over(Window.partitionBy("value").orderBy("key")),
         count("key").over(Window.partitionBy("value").orderBy("key")),
         sum("key").over(Window.partitionBy("value").orderBy("key")),
-        ntile("key").over(Window.partitionBy("value").orderBy("key")),
-        ntile($"key").over(Window.partitionBy("value").orderBy("key")),
+        ntile(2).over(Window.partitionBy("value").orderBy("key")),
         rowNumber().over(Window.partitionBy("value").orderBy("key")),
         denseRank().over(Window.partitionBy("value").orderBy("key")),
         rank().over(Window.partitionBy("value").orderBy("key")),
@@ -127,8 +124,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
            |avg(key) over (partition by value order by key),
            |count(key) over (partition by value order by key),
            |sum(key) over (partition by value order by key),
-           |ntile(key) over (partition by value order by key),
-           |ntile(key) over (partition by value order by key),
+           |ntile(2) over (partition by value order by key),
            |row_number() over (partition by value order by key),
            |dense_rank() over (partition by value order by key),
            |rank() over (partition by value order by key),