From 9526e7cd3c864a05f0916e59fa45f840bc823c71 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 2 Nov 2015 16:11:08 -0800
Subject: [PATCH 1/3] add Python API for stddev/variance

---
 python/pyspark/sql/functions.py |  15 +++++
 python/pyspark/sql/group.py     | 104 ++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index fa04f4cd83b6f..046b681f1d53a 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -122,6 +122,21 @@ def _():
     'bitwiseNOT': 'Computes bitwise not.',
 }
 
+_functions_1_6 = {
+    # aggregate functions (statistical moments of a group)
+    "stddev": "Aggregate function: returns the unbiased sample standard deviation of" +
+              " the expression in a group.",
+    "stddev_samp": "Aggregate function: returns the unbiased sample standard deviation of" +
+              " the expression in a group.",
+    "stddev_pop": "Aggregate function: returns population standard deviation of" +
+              " the expression in a group.",
+    "variance": "Aggregate function: returns the population variance of the values in a group.",
+    "var_samp": "Aggregate function: returns the unbiased variance of the values in a group.",
+    "var_pop": "Aggregate function: returns the population variance of the values in a group.",
+    "skewness": "Aggregate function: returns the skewness of the values in a group.",
+    "kurtosis": "Aggregate function: returns the kurtosis of the values in a group."
+}
+
 # math functions that take two arguments as input
 _binary_mathfunctions = {
     'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' +
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 71c0bccc5eeff..a3084ccf7ec2f 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -167,6 +167,110 @@ def sum(self, *cols):
         [Row(sum(age)=7, sum(height)=165)]
         """
 
+    @df_varargs_api
+    @since(1.6)
+    def stddev(self, *cols):
+        """Compute the sample standard deviation for each numeric columns for each group.
+        The resulting [[DataFrame]] will also contain the grouping columns.
+        When specified columns are given, only compute the stddev for them.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df3.groupBy().stddev('age', 'height').collect()
+        [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def stddev_samp(self, *cols):
+        """Compute the sample standard deviation for each numeric columns for each group.
+        The resulting [[DataFrame]] will also contain the grouping columns.
+        When specified columns are given, only compute the stddev for them.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df3.groupBy().stddev_samp('age', 'height').collect()
+        [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def stddev_pop(self, *cols):
+        """Compute the population standard deviation for each numeric columns for each group.
+        The resulting [[DataFrame]] will also contain the grouping columns.
+        When specified columns are given, only compute the stddev for them.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df3.groupBy().stddev_pop('age', 'height').collect()
+        [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def variance(self, *cols):
+        """Compute the sample variance for each numeric columns for each group.
+        The resulting [[DataFrame]] will also contain the grouping columns.
+        When specified columns are given, only compute the variance for them.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df3.groupBy().variance('age', 'height').collect()
+        [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def var_pop(self, *cols):
+        """Compute the sample variance for each numeric columns for each group.
+        The resulting [[DataFrame]] will also contain the grouping columns.
+        When specified columns are given, only compute the variance for them.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df3.groupBy().var_pop('age', 'height').collect()
+        [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def var_samp(self, *cols):
+        """Compute the sample variance for each numeric columns for each group.
+        The resulting [[DataFrame]] will also contain the grouping columns.
+        When specified columns are given, only compute the variance for them.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df3.groupBy().var_samp('age', 'height').collect()
+        [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def skewness(self, *cols):
+        """Compute the skewness for each numeric columns for each group.
+        The resulting [[DataFrame]] will also contain the grouping columns.
+        When specified columns are given, only compute the skewness values for them.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df3.groupBy().skewness('age', 'height').collect()
+        [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def kurtosis(self, *cols):
+        """Compute the kurtosis for each numeric columns for each group.
+        The resulting [[DataFrame]] will also contain the grouping columns.
+        When specified columns are given, only compute the kurtosis values for them.
+
+        :param cols: list of column names (string). Non-numeric columns are ignored.
+
+        >>> df3.groupBy().kurtosis('age', 'height').collect()
+        [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)]
+        """
+
 
 def _test():
     import doctest

From 673b245febe8165b1bff9ee511a865e8532dd64e Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 2 Nov 2015 23:16:54 -0800
Subject: [PATCH 2/3] cleanup

---
 python/pyspark/sql/functions.py               |  4 +-
 .../org/apache/spark/sql/functions.scala      | 67 -------------------
 2 files changed, 3 insertions(+), 68 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 046b681f1d53a..2f7c2f4aacd47 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -132,7 +132,7 @@ def _():
               " the expression in a group.",
     "variance": "Aggregate function: returns the population variance of the values in a group.",
     "var_samp": "Aggregate function: returns the unbiased variance of the values in a group.",
-    "var_pop": "Aggregate function: returns the population variance of the values in a group.",
+    "var_pop":  "Aggregate function: returns the population variance of the values in a group.",
     "skewness": "Aggregate function: returns the skewness of the values in a group.",
     "kurtosis": "Aggregate function: returns the kurtosis of the values in a group."
 }
@@ -187,6 +187,8 @@ def _():
     globals()[_name] = since(1.4)(_create_binary_mathfunction(_name, _doc))
 for _name, _doc in _window_functions.items():
     globals()[_name] = since(1.4)(_create_window_function(_name, _doc))
+for _name, _doc in _functions_1_6.items():
+    globals()[_name] = since(1.6)(_create_function(_name, _doc))
 del _name, _doc
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index c1737b1ef663c..c5bfba3b243ab 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -236,14 +236,6 @@ object functions {
    */
   def kurtosis(e: Column): Column = Kurtosis(e.expr)
 
-  /**
-   * Aggregate function: returns the kurtosis of the values in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def kurtosis(columnName: String): Column = kurtosis(Column(columnName))
-
   /**
    * Aggregate function: returns the last value in a group.
    *
@@ -318,14 +310,6 @@ object functions {
    */
   def skewness(e: Column): Column = Skewness(e.expr)
 
-  /**
-   * Aggregate function: returns the skewness of the values in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def skewness(columnName: String): Column = skewness(Column(columnName))
-
   /**
    * Aggregate function: returns the unbiased sample standard deviation of
    * the expression in a group.
@@ -335,15 +319,6 @@ object functions {
    */
   def stddev(e: Column): Column = Stddev(e.expr)
 
-  /**
-   * Aggregate function: returns the unbiased sample standard deviation of
-   * the expression in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def stddev(columnName: String): Column = stddev(Column(columnName))
-
   /**
    * Aggregate function: returns the unbiased sample standard deviation of
    * the expression in a group.
@@ -353,15 +328,6 @@ object functions {
    */
   def stddev_samp(e: Column): Column = StddevSamp(e.expr)
 
-  /**
-   * Aggregate function: returns the unbiased sample standard deviation of
-   * the expression in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def stddev_samp(columnName: String): Column = stddev_samp(Column(columnName))
-
   /**
    * Aggregate function: returns the population standard deviation of
    * the expression in a group.
@@ -371,15 +337,6 @@ object functions {
    */
   def stddev_pop(e: Column): Column = StddevPop(e.expr)
 
-  /**
-   * Aggregate function: returns the population standard deviation of
-   * the expression in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def stddev_pop(columnName: String): Column = stddev_pop(Column(columnName))
-
   /**
    * Aggregate function: returns the sum of all values in the expression.
    *
@@ -420,14 +377,6 @@ object functions {
    */
   def variance(e: Column): Column = Variance(e.expr)
 
-  /**
-   * Aggregate function: returns the population variance of the values in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def variance(columnName: String): Column = variance(Column(columnName))
-
   /**
    * Aggregate function: returns the unbiased variance of the values in a group.
    *
@@ -436,14 +385,6 @@ object functions {
    */
   def var_samp(e: Column): Column = VarianceSamp(e.expr)
 
-  /**
-   * Aggregate function: returns the unbiased variance of the values in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def var_samp(columnName: String): Column = var_samp(Column(columnName))
-
   /**
    * Aggregate function: returns the population variance of the values in a group.
    *
@@ -452,14 +393,6 @@ object functions {
    */
   def var_pop(e: Column): Column = VariancePop(e.expr)
 
-  /**
-   * Aggregate function: returns the population variance of the values in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def var_pop(columnName: String): Column = var_pop(Column(columnName))
-
   //////////////////////////////////////////////////////////////////////////////////////////////
   // Window functions
   //////////////////////////////////////////////////////////////////////////////////////////////

From df7e6d3e495b0ed77be6b490b31a45889b9c3efd Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Tue, 3 Nov 2015 07:45:30 -0800
Subject: [PATCH 3/3] simplify docs

---
 python/pyspark/sql/group.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index a3084ccf7ec2f..946b53e71c2c6 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -171,8 +171,6 @@ def sum(self, *cols):
     @since(1.6)
     def stddev(self, *cols):
         """Compute the sample standard deviation for each numeric columns for each group.
-        The resulting [[DataFrame]] will also contain the grouping columns.
-        When specified columns are given, only compute the stddev for them.
 
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
@@ -184,8 +182,6 @@ def stddev(self, *cols):
     @since(1.6)
     def stddev_samp(self, *cols):
         """Compute the sample standard deviation for each numeric columns for each group.
-        The resulting [[DataFrame]] will also contain the grouping columns.
-        When specified columns are given, only compute the stddev for them.
 
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
@@ -197,8 +193,6 @@ def stddev_samp(self, *cols):
     @since(1.6)
     def stddev_pop(self, *cols):
         """Compute the population standard deviation for each numeric columns for each group.
-        The resulting [[DataFrame]] will also contain the grouping columns.
-        When specified columns are given, only compute the stddev for them.
 
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
@@ -210,8 +204,6 @@ def stddev_pop(self, *cols):
     @since(1.6)
     def variance(self, *cols):
-        """Compute the sample variance for each numeric columns for each group.
-        The resulting [[DataFrame]] will also contain the grouping columns.
-        When specified columns are given, only compute the variance for them.
+        """Compute the population variance for each numeric columns for each group.
 
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
@@ -223,8 +215,6 @@ def variance(self, *cols):
     @since(1.6)
     def var_pop(self, *cols):
-        """Compute the sample variance for each numeric columns for each group.
-        The resulting [[DataFrame]] will also contain the grouping columns.
-        When specified columns are given, only compute the variance for them.
+        """Compute the population variance for each numeric columns for each group.
 
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
@@ -236,8 +226,6 @@ def var_pop(self, *cols):
     @since(1.6)
     def var_samp(self, *cols):
         """Compute the sample variance for each numeric columns for each group.
-        The resulting [[DataFrame]] will also contain the grouping columns.
-        When specified columns are given, only compute the variance for them.
 
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
@@ -249,8 +237,6 @@ def var_samp(self, *cols):
     @since(1.6)
     def skewness(self, *cols):
         """Compute the skewness for each numeric columns for each group.
-        The resulting [[DataFrame]] will also contain the grouping columns.
-        When specified columns are given, only compute the skewness values for them.
 
         :param cols: list of column names (string). Non-numeric columns are ignored.
 
@@ -262,8 +248,6 @@ def skewness(self, *cols):
     @since(1.6)
     def kurtosis(self, *cols):
         """Compute the kurtosis for each numeric columns for each group.
-        The resulting [[DataFrame]] will also contain the grouping columns.
-        When specified columns are given, only compute the kurtosis values for them.
 
         :param cols: list of column names (string). Non-numeric columns are ignored.