diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 719eca8f5559e..274a7105a6ff7 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -252,17 +252,46 @@ def corr(col1, col2):
     """Returns a new :class:`Column` for the Pearson Correlation Coefficient
     for ``col1`` and ``col2``.
 
-    >>> a = [x * x - 2 * x + 3.5 for x in range(20)]
-    >>> b = range(20)
-    >>> corrDf = sqlContext.createDataFrame(zip(a, b))
-    >>> corrDf = corrDf.agg(corr(corrDf._1, corrDf._2).alias('c'))
-    >>> corrDf.selectExpr('abs(c - 0.9572339139475857) < 1e-16 as t').collect()
-    [Row(t=True)]
+    >>> a = range(20)
+    >>> b = [2 * x for x in range(20)]
+    >>> df = sqlContext.createDataFrame(zip(a, b), ["a", "b"])
+    >>> df.agg(corr("a", "b").alias('c')).collect()
+    [Row(c=1.0)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.corr(_to_java_column(col1), _to_java_column(col2)))
 
 
+@since(2.0)
+def covar_pop(col1, col2):
+    """Returns a new :class:`Column` for the population covariance of ``col1``
+    and ``col2``.
+
+    >>> a = [1] * 10
+    >>> b = [1] * 10
+    >>> df = sqlContext.createDataFrame(zip(a, b), ["a", "b"])
+    >>> df.agg(covar_pop("a", "b").alias('c')).collect()
+    [Row(c=0.0)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.covar_pop(_to_java_column(col1), _to_java_column(col2)))
+
+
+@since(2.0)
+def covar_samp(col1, col2):
+    """Returns a new :class:`Column` for the sample covariance of ``col1``
+    and ``col2``.
+
+    >>> a = [1] * 10
+    >>> b = [1] * 10
+    >>> df = sqlContext.createDataFrame(zip(a, b), ["a", "b"])
+    >>> df.agg(covar_samp("a", "b").alias('c')).collect()
+    [Row(c=0.0)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.covar_samp(_to_java_column(col1), _to_java_column(col2)))
+
+
 @since(1.3)
 def countDistinct(col, *cols):
     """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.