In [1]:
# find [why] for questions
# why I get error with eval_type = read_int(infile)

import pyspark
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("Functions 3.5.0").getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/01 20:36:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
path = "datasets/"
students = spark.read.csv(path+"students.csv", inferSchema=True, header=True)
tour = spark.read.csv(path+"pga_tour_historical.csv", inferSchema=True, header=True)
tour.limit(100).dropna().dropDuplicates().limit(5).show()
students.limit(5).toPandas()

                                                                                

+---------------+------+----------------+--------------------+-----+
|    Player Name|Season|       Statistic|            Variable|Value|
+---------------+------+----------------+--------------------+-----+
|Robert Garrigus|  2010|Driving Distance|Driving Distance ...|   71|
|   Bubba Watson|  2010|Driving Distance|Driving Distance ...|   77|
| Dustin Johnson|  2010|Driving Distance|Driving Distance ...|   83|
|Brett Wetterich|  2010|Driving Distance|Driving Distance ...|   54|
|    J.B. Holmes|  2010|Driving Distance|Driving Distance ...|  100|
+---------------+------+----------------+--------------------+-----+



Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


23/11/01 20:37:09 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


# Functions

## [Normal Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#normal-functions)

#### pyspark.sql.functions.col(col: str) → pyspark.sql.column.Column
Returns a Column based on the given column name.

In [159]:
col("math score")

Column<'math score'>

#### pyspark.sql.functions.column(col: str) → pyspark.sql.column.Column¶
Returns a Column based on the given column name.

In [160]:
column("math score")

Column<'math score'>

#### pyspark.sql.functions.lit(col: Any) → pyspark.sql.column.Column¶
Creates a Column of literal value.

In [161]:
df = spark.range(1)
df.select(lit(5).alias('height'), df.id).show()

+------+---+
|height| id|
+------+---+
|     5|  0|
+------+---+



In [162]:
spark.range(1).select(lit([1, 2, 3])).show()

+--------------+
|array(1, 2, 3)|
+--------------+
|     [1, 2, 3]|
+--------------+



#### pyspark.sql.functions.broadcast(df: pyspark.sql.dataframe.DataFrame) → pyspark.sql.dataframe.DataFrame
Marks a DataFrame as small enough for use in broadcast joins.

In [185]:
df = spark.createDataFrame([1, 2, 3, 3, 4], IntegerType())
df_small = spark.range(3)
df_b = broadcast(df_small)
df.join(df_b, df.value == df_small.id).show()

+-----+---+
|value| id|
+-----+---+
|    1|  1|
|    2|  2|
+-----+---+



#### pyspark.sql.functions.coalesce(*cols: ColumnOrName) → pyspark.sql.column.Column
Returns the first column that is not null.

In [164]:
cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
cDf.show()

+----+----+
|   a|   b|
+----+----+
|NULL|NULL|
|   1|NULL|
|NULL|   2|
+----+----+



In [165]:
cDf.select(coalesce(cDf["a"], cDf["b"])).show()

+--------------+
|coalesce(a, b)|
+--------------+
|          NULL|
|             1|
|             2|
+--------------+



In [166]:
cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()

+----+----+----------------+
|   a|   b|coalesce(a, 0.0)|
+----+----+----------------+
|NULL|NULL|             0.0|
|   1|NULL|             1.0|
|NULL|   2|             0.0|
+----+----+----------------+



#### pyspark.sql.functions.input_file_name() → pyspark.sql.column.Column
Creates a string column for the file name of the current Spark task.

In [184]:
"""
import os
path = os.path.abspath(__file__)
df = spark.read.text(path)
df.select(input_file_name()).first()
"""

'\nimport os\npath = os.path.abspath(__file__)\ndf = spark.read.text(path)\ndf.select(input_file_name()).first()\n'

#### pyspark.sql.functions.isnan(col: ColumnOrName) → pyspark.sql.column.Column¶
An expression that returns true if the column is NaN.

In [None]:
df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
df.select("a", "b", isnan("a").alias("r1"), isnan(df.b).alias("r2")).show()

#### pyspark.sql.functions.isnull(col: ColumnOrName) → pyspark.sql.column.Column¶
An expression that returns true if the column is null.

In [168]:
df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b"))
df.select("a", "b", isnull("a").alias("r1"), isnull(df.b).alias("r2")).show()

+----+----+-----+-----+
|   a|   b|   r1|   r2|
+----+----+-----+-----+
|   1|NULL|false| true|
|NULL|   2| true|false|
+----+----+-----+-----+



#### pyspark.sql.functions.monotonically_increasing_id() → pyspark.sql.column.Column
A column that generates monotonically increasing 64-bit integers.

The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. The current implementation puts the partition ID in the upper 31 bits, and the record number within each partition in the lower 33 bits. The assumption is that the data frame has less than 1 billion partitions, and each partition has less than 8 billion records.

##### Notes
    The function is non-deterministic because its result depends on partition IDs.

In [169]:
spark.range(0, 10, 1, 2).select(monotonically_increasing_id()).show()

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
|                            2|
|                            3|
|                            4|
|                   8589934592|
|                   8589934593|
|                   8589934594|
|                   8589934595|
|                   8589934596|
+-----------------------------+



#### pyspark.sql.functions.named_struct(*cols: ColumnOrName) → pyspark.sql.column.Column
Creates a struct with the given field names and values.

In [170]:
df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c'])
df.select(named_struct(lit('x'), df.a, lit('y'), df.b).alias('r')).collect()

[Row(r=Row(x=1, y=2))]

#### pyspark.sql.functions.nanvl(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Returns col1 if it is not NaN, or col2 if col1 is NaN.

Both inputs should be floating point columns (DoubleType or FloatType).

In [171]:
df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect()

[Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]

#### pyspark.sql.functions.rand(seed: Optional[int] = None) → pyspark.sql.column.Column
Generates a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0).

##### Notes
    The function is non-deterministic in general case.

In [172]:
spark.range(0, 2, 1, 1).withColumn('rand', rand(seed=42) * 3).show()

+---+------------------+
| id|              rand|
+---+------------------+
|  0|1.8575681106759028|
|  1|1.5288056527339444|
+---+------------------+



#### pyspark.sql.functions.randn(seed: Optional[int] = None) → pyspark.sql.column.Column
Generates a column with independent and identically distributed (i.i.d.) samples from the standard normal distribution.

##### Notes
    The function is non-deterministic in general case.

In [173]:
spark.range(0, 2, 1, 1).withColumn('randn', randn(seed=42)).show()

+---+------------------+
| id|             randn|
+---+------------------+
|  0| 2.384479054241165|
|  1|0.1920934041293524|
+---+------------------+



#### pyspark.sql.functions.spark_partition_id() → pyspark.sql.column.Column
A column for partition ID.

##### Notes
    This is non deterministic because it depends on data partitioning and task scheduling.

In [174]:
df = spark.range(2)
df.repartition(1).select(spark_partition_id().alias("pid")).collect()

[Row(pid=0), Row(pid=0)]

#### pyspark.sql.functions.when(condition: pyspark.sql.column.Column, value: Any) → pyspark.sql.column.Column
Evaluates a list of conditions and returns one of multiple possible result expressions. If pyspark.sql.Column.otherwise() is not invoked, None is returned for unmatched conditions.

In [175]:
df = spark.range(3)
df.select(when(df['id'] == 2, 3).otherwise(4).alias("age")).show()

+---+
|age|
+---+
|  4|
|  4|
|  3|
+---+



In [176]:
df.select(when(df.id == 2, df.id + 1).alias("age")).show()

+----+
| age|
+----+
|NULL|
|NULL|
|   3|
+----+



#### pyspark.sql.functions.bitwise_not(col: ColumnOrName) → pyspark.sql.column.Column¶
Computes bitwise not.

In [177]:
df = spark.range(1)
df.select(bitwise_not(lit(0))).show()

+---+
| ~0|
+---+
| -1|
+---+



In [178]:
df.select(bitwise_not(lit(1))).show()

+---+
| ~1|
+---+
| -2|
+---+



#### pyspark.sql.functions.bitwiseNOT(col: ColumnOrName) → pyspark.sql.column.Column
Computes bitwise not.
Deprecated since version 3.2.0: Use bitwise_not() instead.

#### pyspark.sql.functions.expr(str: str) → pyspark.sql.column.Column
Parses the expression string into the column that it represents

In [179]:
df = spark.createDataFrame([["Alice"], ["Bob"]], ["name"])
df.select("name", expr("length(name)")).show()

+-----+------------+
| name|length(name)|
+-----+------------+
|Alice|           5|
|  Bob|           3|
+-----+------------+



#### pyspark.sql.functions.greatest(*cols: ColumnOrName) → pyspark.sql.column.Column¶
Returns the greatest value of the list of column names, skipping null values. This function takes at least 2 parameters. It will return null if all parameters are null.

In [180]:
df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])
df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect()

[Row(greatest=4)]

#### pyspark.sql.functions.least(*cols: ColumnOrName) → pyspark.sql.column.Column
Returns the least value of the list of column names, skipping null values. This function takes at least 2 parameters. It will return null if all parameters are null.

In [181]:
df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])
df.select(least(df.a, df.b, df.c).alias("least")).collect()

[Row(least=1)]

## Math Functions

## [Datetime Functions]()

## [Collection Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#collection-functions)

#### pyspark.sql.functions.array(*cols: Union[ColumnOrName, List[ColumnOrName_], Tuple[ColumnOrName_, …]]) → pyspark.sql.column.Column
Creates a new array column.

In [41]:
df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))
df.select(array('age', 'age').alias("arr")).collect()

[Row(arr=[2, 2]), Row(arr=[5, 5])]

In [42]:
df.select(array([df.age, df.age]).alias("arr")).collect()

[Row(arr=[2, 2]), Row(arr=[5, 5])]

In [43]:
df.select(array('age', 'age').alias("col")).printSchema()

root
 |-- col: array (nullable = false)
 |    |-- element: long (containsNull = true)



#### pyspark.sql.functions.array_contains(col: ColumnOrName, value: Any) → pyspark.sql.column.Column
Collection function: returns null if the array is null, true if the array contains the given value, and false otherwise.

In [44]:
df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
df.select(array_contains(df.data, "a")).collect()

[Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]

In [45]:
df.select(array_contains(df.data, lit("a"))).collect()

[Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]

#### pyspark.sql.functions.arrays_overlap(a1: ColumnOrName, a2: ColumnOrName) → pyspark.sql.column.Column
Collection function: returns true if the arrays contain any common non-null element; if not, returns null if both the arrays are non-empty and any of them contains a null element; returns false otherwise.

In [46]:
df = spark.createDataFrame([(["a", "b"], ["b", "c"]), (["a"], ["b", "c"])], ['x', 'y'])
df.select(arrays_overlap(df.x, df.y).alias("overlap")).collect()

[Row(overlap=True), Row(overlap=False)]

#### pyspark.sql.functions.array_join(col: ColumnOrName, delimiter: str, null_replacement: Optional[str] = None) → pyspark.sql.column.Column
Concatenates the elements of column using the delimiter. Null values are replaced with null_replacement if set, otherwise they are ignored.

##### Parameters:
- col: Column or str | 
target column to work on.
- delimiter: str | 
delimiter used to concatenate elements
- null_replacement: str, optional | 
if set then null values will be replaced by this value

Returns – Column | 
a column of string type. Concatenated values.

In [47]:
df = spark.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data'])
df.select(array_join(df.data, ",").alias("joined")).collect()

[Row(joined='a,b,c'), Row(joined='a')]

In [48]:
df.select(array_join(df.data, ",", "NULL").alias("joined")).collect()

[Row(joined='a,b,c'), Row(joined='a,NULL')]

#### pyspark.sql.functions.create_map(*cols: Union[ColumnOrName, List[ColumnOrName_], Tuple[ColumnOrName_, …]]) → pyspark.sql.column.Column
Creates a new map column.

In [49]:
df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))
df.select(create_map('name', 'age').alias("map")).collect()

[Row(map={'Alice': 2}), Row(map={'Bob': 5})]

In [50]:
df.select(create_map([df.name, df.age]).alias("map")).collect()

[Row(map={'Alice': 2}), Row(map={'Bob': 5})]

#### pyspark.sql.functions.slice(x: ColumnOrName, start: Union[ColumnOrName, int], length: Union[ColumnOrName, int]) → pyspark.sql.column.Column
Collection function: returns an array containing all the elements in x from index start (array indices start at 1, or from the end if start is negative) with the specified length.

Parameters:
- x: Column or str  | 
column name or column containing the array to be sliced
- start: Column or str or int | 
column name, column, or int containing the starting index
- length: Column or str or int | 
column name, column, or int containing the length of the slice

Returns – Column | 
a column of array type. Subset of array.

In [51]:
df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])
df.select(slice(df.x, 2, 2).alias("sliced")).collect()

[Row(sliced=[2, 3]), Row(sliced=[5])]

#### pyspark.sql.functions.concat(*cols: ColumnOrName) → pyspark.sql.column.Column
Concatenates multiple input columns together into a single column. The function works with strings, numeric, binary and compatible array columns.

See also – pyspark.sql.functions.array_join() | 
to concatenate string columns with delimiter

In [52]:
df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
df = df.select(concat(df.s, df.d).alias('s'))
df.collect()

[Row(s='abcd123')]

In [53]:
df

DataFrame[s: string]

In [54]:
df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c'])
df = df.select(concat(df.a, df.b, df.c).alias("arr"))
df.collect()

[Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]

In [55]:
df

DataFrame[arr: array<bigint>]

#### pyspark.sql.functions.array_position(col: ColumnOrName, value: Any) → pyspark.sql.column.Column
Collection function: Locates the position of the first occurrence of the given value in the given array. Returns null if either of the arguments are null.

##### Notes
    The position is not zero based, but 1 based index. Returns 0 if the given value could not be found in the array.

In [56]:
df = spark.createDataFrame([(["c", "b", "a"],), ([],)], ['data'])
df.select(array_position(df.data, "a")).collect()

[Row(array_position(data, a)=3), Row(array_position(data, a)=0)]

#### pyspark.sql.functions.element_at(col: ColumnOrName, extraction: Any) → pyspark.sql.column.Column
Collection function: Returns element of array at given index in extraction if col is array. Returns value for the given key in extraction if col is map. If position is negative then location of the element will start from end, if number is outside the array boundaries then None will be returned.

##### Notes
    The position is not zero based, but 1 based index.

In [57]:
df = spark.createDataFrame([(["a", "b", "c"],)], ['data'])
df.select(element_at(df.data, 1)).collect()

[Row(element_at(data, 1)='a')]

In [58]:
df.select(element_at(df.data, -1)).collect()

[Row(element_at(data, -1)='c')]

In [59]:
df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data'])
df.select(element_at(df.data, lit("a"))).collect()

[Row(element_at(data, a)=1.0)]

#### pyspark.sql.functions.array_append(col: ColumnOrName, value: Any) → pyspark.sql.column.Column
Collection function: returns an array of the elements in col1 along with the added element in col2 at the last of the array.

##### Notes
    Supports Spark Connect.

In [60]:
df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2="c")])
df.select(array_append(df.c1, df.c2)).collect()

[Row(array_append(c1, c2)=['b', 'a', 'c', 'c'])]

In [61]:
df.select(array_append(df.c1, 'x')).collect()

[Row(array_append(c1, x)=['b', 'a', 'c', 'x'])]

#### pyspark.sql.functions.array_size(col: ColumnOrName) → pyspark.sql.column.Column
Returns the total number of elements in the array. The function returns null for null input.

In [62]:
df = spark.createDataFrame([([2, 1, 3],), (None,)], ['data'])
df.select(array_size(df.data).alias('r')).collect()

[Row(r=3), Row(r=None)]

#### pyspark.sql.functions.array_sort(col: ColumnOrName, comparator: Optional[Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column]] = None) → pyspark.sql.column.Column
Collection function: sorts the input array in ascending order. The elements of the input array must be orderable. Null elements will be placed at the end of the returned array.

##### Parameters:
- col: Column or str | 
name of column or expression
- comparator: callable, optional | 
A binary (Column, Column) -> Column: .... The comparator will take two arguments representing two elements of the array. It returns a negative integer, 0, or a positive integer as the first element is less than, equal to, or greater than the second element. If the comparator function returns null, the function will fail and raise an error.

Returns – Column | 
sorted array.

In [63]:
df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data'])
df.select(array_sort(df.data).alias('r')).collect()

[Row(r=[1, 2, 3, None]), Row(r=[1]), Row(r=[])]

In [64]:
df = spark.createDataFrame([(["foo", "foobar", None, "bar"],),(["foo"],),([],)], ['data'])
df.select(array_sort(
    "data",
    lambda x, y: when(x.isNull() | y.isNull(), lit(0)).otherwise(length(y) - length(x))
).alias("r")).collect()

[Row(r=['foobar', 'foo', None, 'bar']), Row(r=['foo']), Row(r=[])]

#### pyspark.sql.functions.array_insert(arr: ColumnOrName, pos: Union[ColumnOrName, int], value: Any) → pyspark.sql.column.Column
Collection function: adds an item into a given array at a specified array index. Array indices start at 1, or start from the end if index is negative. Index above array size appends the array, or prepends the array if index is negative, with ‘null’ elements.

##### Parameters:
- arr: Column or str | 
name of column containing an array
- pos: Column or str or int | 
name of Numeric type column indicating position of insertion (starting at index 1, negative position is a start from the back of the array)
- value :
a literal value, or a Column expression.

Returns – Column | 
an array of values, including the new specified value

##### Notes
    Supports Spark Connect.

In [65]:
df = spark.createDataFrame(
    [(['a', 'b', 'c'], 2, 'd'), (['c', 'b', 'a'], -2, 'd')],
    ['data', 'pos', 'val']
)
df.select(array_insert(df.data, df.pos.cast('integer'), df.val).alias('data')).collect()

[Row(data=['a', 'd', 'b', 'c']), Row(data=['c', 'b', 'd', 'a'])]

In [66]:
df.select(array_insert(df.data, 5, 'hello').alias('data')).collect()

[Row(data=['a', 'b', 'c', None, 'hello']),
 Row(data=['c', 'b', 'a', None, 'hello'])]

#### pyspark.sql.functions.array_remove(col: ColumnOrName, element: Any) → pyspark.sql.column.Column
Collection function: Remove all elements that equal to element from the given array.

In [67]:
df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([],)], ['data'])
df.select(array_remove(df.data, 1)).collect()

[Row(array_remove(data, 1)=[2, 3]), Row(array_remove(data, 1)=[])]

#### pyspark.sql.functions.array_prepend(col: ColumnOrName, value: Any) → pyspark.sql.column.Column
Collection function: Returns an array containing element as well as all elements from array. The new element is positioned at the beginning of the array.

In [68]:
df = spark.createDataFrame([([2, 3, 4],), ([],)], ['data'])
df.select(array_prepend(df.data, 1)).collect()

[Row(array_prepend(data, 1)=[1, 2, 3, 4]), Row(array_prepend(data, 1)=[1])]

#### pyspark.sql.functions.array_distinct(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: removes duplicate values from the array.

In [69]:
df = spark.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], ['data'])
df.select(array_distinct(df.data)).collect()

[Row(array_distinct(data)=[1, 2, 3]), Row(array_distinct(data)=[4, 5])]

#### pyspark.sql.functions.array_intersect(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column¶
Collection function: returns an array of the elements in the intersection of col1 and col2, without duplicates.

In [70]:
df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
df.select(array_intersect(df.c1, df.c2)).collect()

[Row(array_intersect(c1, c2)=['a', 'c'])]

#### pyspark.sql.functions.array_union(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column¶
Collection function: returns an array of the elements in the union of col1 and col2, without duplicates.

In [71]:
df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
df.select(array_union(df.c1, df.c2)).collect()

[Row(array_union(c1, c2)=['b', 'a', 'c', 'd', 'f'])]

#### pyspark.sql.functions.array_except(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Collection function: returns an array of the elements in col1 but not in col2, without duplicates.

In [72]:
df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
df.select(array_except(df.c1, df.c2)).collect()

[Row(array_except(c1, c2)=['b'])]

#### pyspark.sql.functions.array_compact(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: removes null values from the array.

##### Notes
    Supports Spark Connect.

In [73]:
df = spark.createDataFrame([([1, None, 2, 3],), ([4, 5, None, 4],)], ['data'])
df.select(array_compact(df.data)).collect()

[Row(array_compact(data)=[1, 2, 3]), Row(array_compact(data)=[4, 5, 4])]

#### pyspark.sql.functions.transform(col: ColumnOrName, f: Union[Callable[[pyspark.sql.column.Column], pyspark.sql.column.Column], Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column]]) → pyspark.sql.column.Column
Returns an array of elements after applying a transformation to each element in the input array.

##### Parameters:
- col: Column or str | 
name of column or expression
- f: function | 
a function that is applied to each element of the input array. Can take one of the following forms:
- * Unary (x: Column) -> Column: ...
- * Binary (x: Column, i: Column) -> Column..., where the second argument is
a 0-based index of the element.
and can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns – Column | 
a new array of transformed elements.

In [74]:
df = spark.createDataFrame([(1, [1, 2, 3, 4])], ("key", "values"))
df.select(transform("values", lambda x: x * 2).alias("doubled")).show()

+------------+
|     doubled|
+------------+
|[2, 4, 6, 8]|
+------------+



In [75]:
def alternate(x, i):
    return when(i % 2 == 0, x).otherwise(-x)

df.select(transform("values", alternate).alias("alternated")).show()

+--------------+
|    alternated|
+--------------+
|[1, -2, 3, -4]|
+--------------+



#### pyspark.sql.functions.exists(col: ColumnOrName, f: Callable[[pyspark.sql.column.Column], pyspark.sql.column.Column]) → pyspark.sql.column.Column
Returns whether a predicate holds for one or more elements in the array.

##### Parameters:
- col: Column or str | 
name of column or expression
- f: function | 
(x: Column) -> Column: ... returning the Boolean expression. Can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns – Column | 
True if “any” element of an array evaluates to True when passed as an argument to given function and False otherwise.

In [76]:
df = spark.createDataFrame([(1, [1, 2, 3, 4]), (2, [3, -1, 0])],("key", "values"))
df.select(exists("values", lambda x: x < 0).alias("any_negative")).show()

+------------+
|any_negative|
+------------+
|       false|
|        true|
+------------+



#### pyspark.sql.functions.forall(col: ColumnOrName, f: Callable[[pyspark.sql.column.Column], pyspark.sql.column.Column]) → pyspark.sql.column.Column
Returns whether a predicate holds for every element in the array.

##### Parameters:
- col: Column or str | 
name of column or expression
- f: function | 
(x: Column) -> Column: ... returning the Boolean expression. Can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns – Column | 
True if “all” elements of an array evaluates to True when passed as an argument to given function and False otherwise.

In [77]:
df = spark.createDataFrame(
    [(1, ["bar"]), (2, ["foo", "bar"]), (3, ["foobar", "foo"])],
    ("key", "values")
)
df.select(forall("values", lambda x: x.rlike("foo")).alias("all_foo")).show()

+-------+
|all_foo|
+-------+
|  false|
|  false|
|   true|
+-------+



#### pyspark.sql.functions.filter(col: ColumnOrName, f: Union[Callable[[pyspark.sql.column.Column], pyspark.sql.column.Column], Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column]]) → pyspark.sql.column.Column
Returns an array of elements for which a predicate holds in a given array.

##### Parameters:
- col: Column or str | 
name of column or expression
- f: function | 
A function that returns the Boolean expression. Can take one of the following forms:
- * Unary (x: Column) -> Column: ...
- * Binary (x: Column, i: Column) -> Column..., where the second argument is
a 0-based index of the element.
and can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns – Column | 
filtered array of elements where given function evaluated to True when passed as an argument.

In [78]:
df = spark.createDataFrame(
    [(1, ["2018-09-20",  "2019-02-03", "2019-07-01", "2020-06-01"])],
    ("key", "values")
)
def after_second_quarter(x):
    return month(to_date(x)) > 6

df.select(
    filter("values", after_second_quarter).alias("after_second_quarter")
).show(truncate=False)

+------------------------+
|after_second_quarter    |
+------------------------+
|[2018-09-20, 2019-07-01]|
+------------------------+



#### pyspark.sql.functions.aggregate(col: ColumnOrName, initialValue: ColumnOrName, merge: Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column], finish: Optional[Callable[[pyspark.sql.column.Column], pyspark.sql.column.Column]] = None) → pyspark.sql.column.Column
Applies a binary operator to an initial state and all elements in the array, and reduces this to a single state. The final state is converted into the final result by applying a finish function.

Both functions can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

##### Parameters:
- col: Column or str | 
name of column or expression
- initial: ValueColumn or str | 
initial value. Name of column or expression
- merge: function | 
a binary function (acc: Column, x: Column) -> Column... returning expression of the same type as zero
- finish: function | 
an optional unary function (x: Column) -> Column: ... used to convert accumulated value.

Returns – Column | 
final value after aggregate function is applied.

In [79]:
df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values"))
df.select(aggregate("values", lit(0.0), lambda acc, x: acc + x).alias("sum")).show()

+----+
| sum|
+----+
|42.0|
+----+



In [80]:
def merge(acc, x):
    count = acc.count + 1
    sum = acc.sum + x
    return struct(count.alias("count"), sum.alias("sum"))

df.select(
    aggregate(
        "values",
        struct(lit(0).alias("count"), lit(0.0).alias("sum")),
        merge,
        lambda acc: acc.sum / acc.count,
    ).alias("mean")
).show()

+----+
|mean|
+----+
| 8.4|
+----+



#### pyspark.sql.functions.zip_with(left: ColumnOrName, right: ColumnOrName, f: Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column]) → pyspark.sql.column.Column
Merge two given arrays, element-wise, into a single array using a function. If one array is shorter, nulls are appended at the end to match the length of the longer array, before applying the function.

Parameters: 
- left: Column or str | 
name of the first column or expression
- right: Column or str | 
name of the second column or expression
- f: function | 
a binary function (x1: Column, x2: Column) -> Column... Can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns – Column | 
array of calculated values derived by applying given function to each pair of arguments.

In [81]:
df = spark.createDataFrame([(1, [1, 3, 5, 8], [0, 2, 4, 6])], ("id", "xs", "ys"))
df.select(zip_with("xs", "ys", lambda x, y: x ** y).alias("powers")).show(truncate=False)

+---------------------------+
|powers                     |
+---------------------------+
|[1.0, 9.0, 625.0, 262144.0]|
+---------------------------+



In [82]:
df = spark.createDataFrame([(1, ["foo", "bar"], [1, 2, 3])], ("id", "xs", "ys"))
df.select(zip_with("xs", "ys", lambda x, y: concat_ws("_", x, y)).alias("xs_ys")).show()

+-----------------+
|            xs_ys|
+-----------------+
|[foo_1, bar_2, 3]|
+-----------------+



#### pyspark.sql.functions.transform_keys(col: ColumnOrName, f: Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column]) → pyspark.sql.column.Column
Applies a function to every key-value pair in a map and returns a map with the results of those applications as the new keys for the pairs.

##### Parameters:
- col: Column or str | 
name of column or expression
- f: function | 
a binary function (k: Column, v: Column) -> Column... Can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns – Column | 
a new map of enties where new keys were calculated by applying given function to each key value argument.

In [83]:
df = spark.createDataFrame([(1, {"foo": -2.0, "bar": 2.0})], ("id", "data"))
row = df.select(transform_keys(
    "data", lambda k, _: upper(k)).alias("data_upper")
).head()
sorted(row["data_upper"].items())

[('BAR', 2.0), ('FOO', -2.0)]

#### pyspark.sql.functions.transform_values(col: ColumnOrName, f: Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column]) → pyspark.sql.column.Column¶
Applies a function to every key-value pair in a map and returns a map with the results of those applications as the new values for the pairs.

##### Parameters: 
- col: Column or str | 
name of column or expression
- f: function | 
a binary function (k: Column, v: Column) -> Column... Can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns – Column | 
a new map of enties where new values were calculated by applying given function to each key value argument.

In [84]:
df = spark.createDataFrame([(1, {"IT": 10.0, "SALES": 2.0, "OPS": 24.0})], ("id", "data"))
row = df.select(transform_values(
    "data", lambda k, v: when(k.isin("IT", "OPS"), v + 10.0).otherwise(v)
).alias("new_data")).head()
sorted(row["new_data"].items())

[('IT', 20.0), ('OPS', 34.0), ('SALES', 2.0)]

#### pyspark.sql.functions.map_filter(col: ColumnOrName, f: Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column]) → pyspark.sql.column.Column
Returns a map whose key-value pairs satisfy a predicate.

##### Parameters: 
- col: Column or str | 
name of column or expression
- f: function | 
a binary function (k: Column, v: Column) -> Column... Can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns– Column | 
filtered map.

In [85]:
df = spark.createDataFrame([(1, {"foo": 42.0, "bar": 1.0, "baz": 32.0})], ("id", "data"))
row = df.select(map_filter(
    "data", lambda _, v: v > 30.0).alias("data_filtered")
).head()
sorted(row["data_filtered"].items())

[('baz', 32.0), ('foo', 42.0)]

#### pyspark.sql.functions.map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Creates a new map from two arrays.

In [86]:
df = spark.createDataFrame([([2, 5], ['a', 'b'])], ['k', 'v'])
df = df.select(map_from_arrays(df.k, df.v).alias("col"))
df.show()

+----------------+
|             col|
+----------------+
|{2 -> a, 5 -> b}|
+----------------+



In [87]:
df.printSchema()

root
 |-- col: map (nullable = true)
 |    |-- key: long
 |    |-- value: string (valueContainsNull = true)



#### pyspark.sql.functions.map_zip_with(col1: ColumnOrName, col2: ColumnOrName, f: Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column]) → pyspark.sql.column.Column
Merge two given maps, key-wise into a single map using a function.

##### Parameters:
- col1: Column or str | 
name of the first column or expression
- col2: Column or str | 
name of the second column or expression
- f: function | 
a ternary function (k: Column, v1: Column, v2: Column) -> Column... Can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

Returns – Column | 
zipped map where entries are calculated by applying given function to each pair of arguments.

In [88]:
df = spark.createDataFrame([
    (1, {"IT": 24.0, "SALES": 12.00}, {"IT": 2.0, "SALES": 1.4})],
    ("id", "base", "ratio")
)
row = df.select(map_zip_with(
    "base", "ratio", lambda k, v1, v2: round(v1 * v2, 2)).alias("updated_data")
).head()
sorted(row["updated_data"].items())

[('IT', 48.0), ('SALES', 16.8)]

#### pyspark.sql.functions.explode(col: ColumnOrName) → pyspark.sql.column.Column
Returns a new row for each element in the given array or map. Uses the default column name col for elements in the array and key and value for elements in the map unless specified otherwise.

In [89]:
df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
df.select(explode(df.intlist).alias("anInt")).collect()

[Row(anInt=1), Row(anInt=2), Row(anInt=3)]

In [90]:
df.select(explode(df.mapfield).alias("key", "value")).show()

+---+-----+
|key|value|
+---+-----+
|  a|    b|
+---+-----+



#### pyspark.sql.functions.explode_outer(col: ColumnOrName) → pyspark.sql.column.Column
Returns a new row for each element in the given array or map. Unlike explode, if the array/map is null or empty then null is produced. Uses the default column name col for elements in the array and key and value for elements in the map unless specified otherwise.

In [91]:
df = spark.createDataFrame(
    [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],
    ("id", "an_array", "a_map")
)
df.select("id", "an_array", explode_outer("a_map")).show()

+---+----------+----+-----+
| id|  an_array| key|value|
+---+----------+----+-----+
|  1|[foo, bar]|   x|  1.0|
|  2|        []|NULL| NULL|
|  3|      NULL|NULL| NULL|
+---+----------+----+-----+



In [92]:
df.select("id", "a_map", explode_outer("an_array")).show()

+---+----------+----+
| id|     a_map| col|
+---+----------+----+
|  1|{x -> 1.0}| foo|
|  1|{x -> 1.0}| bar|
|  2|        {}|NULL|
|  3|      NULL|NULL|
+---+----------+----+



#### pyspark.sql.functions.posexplode(col: ColumnOrName) → pyspark.sql.column.Column
Returns a new row for each element with position in the given array or map. Uses the default column name pos for position, and col for elements in the array and key and value for elements in the map unless specified otherwise.

In [93]:
df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
df.select(posexplode(df.intlist)).collect()

[Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)]

In [94]:
df.select(posexplode(df.mapfield)).show()

+---+---+-----+
|pos|key|value|
+---+---+-----+
|  0|  a|    b|
+---+---+-----+



#### pyspark.sql.functions.posexplode_outer(col: ColumnOrName) → pyspark.sql.column.Column
Returns a new row for each element with position in the given array or map. Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced. Uses the default column name pos for position, and col for elements in the array and key and value for elements in the map unless specified otherwise.

In [95]:
df = spark.createDataFrame(
    [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],
    ("id", "an_array", "a_map")
)
df.select("id", "an_array", posexplode_outer("a_map")).show()

+---+----------+----+----+-----+
| id|  an_array| pos| key|value|
+---+----------+----+----+-----+
|  1|[foo, bar]|   0|   x|  1.0|
|  2|        []|NULL|NULL| NULL|
|  3|      NULL|NULL|NULL| NULL|
+---+----------+----+----+-----+



In [96]:
df.select("id", "a_map", posexplode_outer("an_array")).show()

+---+----------+----+----+
| id|     a_map| pos| col|
+---+----------+----+----+
|  1|{x -> 1.0}|   0| foo|
|  1|{x -> 1.0}|   1| bar|
|  2|        {}|NULL|NULL|
|  3|      NULL|NULL|NULL|
+---+----------+----+----+



#### pyspark.sql.functions.inline(col: ColumnOrName) → pyspark.sql.column.Column
Explodes an array of structs into a table.

##### Notes
    Supports Spark Connect.

In [97]:
df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])])
df.select(inline(df.structlist)).show()

+---+---+
|  a|  b|
+---+---+
|  1|  2|
|  3|  4|
+---+---+



#### pyspark.sql.functions.inline_outer(col: ColumnOrName) → pyspark.sql.column.Column
Explodes an array of structs into a table. Unlike inline, if the array is null or empty then null is produced for each nested column.

##### Notes
    Supports Spark Connect.

In [98]:
df = spark.createDataFrame([
    Row(id=1, structlist=[Row(a=1, b=2), Row(a=3, b=4)]),
    Row(id=2, structlist=[])
])
df.select('id', inline_outer(df.structlist)).show()

+---+----+----+
| id|   a|   b|
+---+----+----+
|  1|   1|   2|
|  1|   3|   4|
|  2|NULL|NULL|
+---+----+----+



#### pyspark.sql.functions.get(col: ColumnOrName, index: Union[ColumnOrName, int]) → pyspark.sql.column.Column
Collection function: Returns element of array at given (0-based) index. If the index points outside of the array boundaries, then this function returns NULL.

##### Notes
    The position is not 1 based, but 0 based index. Supports Spark Connect.

In [99]:
df = spark.createDataFrame([(["a", "b", "c"], 1)], ['data', 'index'])
df.select(get(df.data, 1)).show()

+------------+
|get(data, 1)|
+------------+
|           b|
+------------+



In [100]:
df.select(get(df.data, -1)).show()

+-------------+
|get(data, -1)|
+-------------+
|         NULL|
+-------------+



In [101]:
df.select(get(df.data, 3)).show()

+------------+
|get(data, 3)|
+------------+
|        NULL|
+------------+



In [102]:
df.select(get(df.data, "index")).show()

+----------------+
|get(data, index)|
+----------------+
|               b|
+----------------+



In [103]:
df.select(get(df.data, col("index") - 1)).show()

+----------------------+
|get(data, (index - 1))|
+----------------------+
|                     a|
+----------------------+



#### pyspark.sql.functions.get_json_object(col: ColumnOrName, path: str) → pyspark.sql.column.Column
Extracts json object from a json string based on json path specified, and returns json string of the extracted json object. It will return null if the input json string is invalid.

In [104]:
data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
df = spark.createDataFrame(data, ("key", "jstring"))
df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \
                  get_json_object(df.jstring, '$.f2').alias("c1") ).collect()

[Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]

#### pyspark.sql.functions.json_tuple(col: ColumnOrName, *fields: str) → pyspark.sql.column.Column
Creates a new row for a json column according to the given field names.

##### Parameters: 
- col: Column or str | 
string column in json format
- fields: str | 
a field or fields to extract

Returns – Column | 
a new row for each given field value from json object

In [105]:
ata = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
df = spark.createDataFrame(data, ("key", "jstring"))
df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect()

[Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]

#### pyspark.sql.functions.from_json(col: ColumnOrName, schema: Union[pyspark.sql.types.ArrayType, pyspark.sql.types.StructType, pyspark.sql.column.Column, str], options: Optional[Dict[str, str]] = None) → pyspark.sql.column.Column
Parses a column containing a JSON string into a MapType with StringType as keys type, StructType or ArrayType with the specified schema. Returns null, in the case of an unparseable string.

##### Parameters: 
- col: Column or str | 
a column or column name in JSON format
- schema: DataType or str | 
a StructType, ArrayType of StructType or Python string literal with a DDL-formatted string to use when parsing the json column
- options: dict, optional | 
options to control parsing. accepts the same options as the json datasource. See Data Source Option for the version you use.

Returns – Column | 
a new column of complex type from given JSON object.

In [106]:
data = [(1, '''{"a": 1}''')]
schema = StructType([StructField("a", IntegerType())])
df = spark.createDataFrame(data, ("key", "value"))
df.select(from_json(df.value, schema).alias("json")).collect()

[Row(json=Row(a=1))]

In [107]:
df.select(from_json(df.value, "a INT").alias("json")).collect()

[Row(json=Row(a=1))]

In [108]:
df.select(from_json(df.value, "MAP<STRING,INT>").alias("json")).collect()

[Row(json={'a': 1})]

In [109]:
data = [(1, '''[{"a": 1}]''')]
schema = ArrayType(StructType([StructField("a", IntegerType())]))
df = spark.createDataFrame(data, ("key", "value"))
df.select(from_json(df.value, schema).alias("json")).collect()

[Row(json=[Row(a=1)])]

In [110]:
schema = schema_of_json(lit('''{"a": 0}'''))
df.select(from_json(df.value, schema).alias("json")).collect()

[Row(json=Row(a=None))]

In [111]:
data = [(1, '''[1, 2, 3]''')]
schema = ArrayType(IntegerType())
df = spark.createDataFrame(data, ("key", "value"))
df.select(from_json(df.value, schema).alias("json")).collect()

[Row(json=[1, 2, 3])]

#### pyspark.sql.functions.schema_of_json(json: ColumnOrName, options: Optional[Dict[str, str]] = None) → pyspark.sql.column.Column
Parses a JSON string and infers its schema in DDL format.

##### Parameters: 
- json: Column or str | 
a JSON string or a foldable string column containing a JSON string.
- optionsdict, optional | 
options to control parsing. accepts the same options as the JSON datasource. See Data Source Option for the version you use. (Changed in version 3.0.0: It accepts options parameter to control schema inferring.)

Returns – Column | 
a string representation of a StructType parsed from given JSON.

In [112]:
df = spark.range(1)
df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect()

[Row(json='STRUCT<a: BIGINT>')]

In [113]:
schema = schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'})
df.select(schema.alias("json")).collect()

[Row(json='STRUCT<a: BIGINT>')]

#### pyspark.sql.functions.to_json(col: ColumnOrName, options: Optional[Dict[str, str]] = None) → pyspark.sql.column.Column
Converts a column containing a StructType, ArrayType or a MapType into a JSON string. Throws an exception, in the case of an unsupported type.

##### Parameters: 
- col: Column or str | 
name of column containing a struct, an array or a map.
- options: dict, optional | 
options to control converting. accepts the same options as the JSON datasource. See Data Source Option for the version you use. Additionally the function supports the pretty option which enables pretty JSON generation.

Returns – Column | 
JSON object as string column.

In [114]:
data = [(1, Row(age=2, name='Alice'))]
df = spark.createDataFrame(data, ("key", "value"))
df.select(to_json(df.value).alias("json")).collect()

[Row(json='{"age":2,"name":"Alice"}')]

In [115]:
data = [(1, [Row(age=2, name='Alice'), Row(age=3, name='Bob')])]
df = spark.createDataFrame(data, ("key", "value"))
df.select(to_json(df.value).alias("json")).collect()

[Row(json='[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')]

In [116]:
data = [(1, {"name": "Alice"})]
df = spark.createDataFrame(data, ("key", "value"))
df.select(to_json(df.value).alias("json")).collect()

[Row(json='{"name":"Alice"}')]

In [117]:
data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]
df = spark.createDataFrame(data, ("key", "value"))
df.select(to_json(df.value).alias("json")).collect()

[Row(json='[{"name":"Alice"},{"name":"Bob"}]')]

In [118]:
data = [(1, ["Alice", "Bob"])]
df = spark.createDataFrame(data, ("key", "value"))
df.select(to_json(df.value).alias("json")).collect()

[Row(json='["Alice","Bob"]')]

#### pyspark.sql.functions.json_array_length(col: ColumnOrName) → pyspark.sql.column.Column
Returns the number of elements in the outermost JSON array. NULL is returned in case of any other valid JSON string, NULL or an invalid JSON.

In [119]:
df = spark.createDataFrame([(None,), ('[1, 2, 3]',), ('[]',)], ['data'])
df.select(json_array_length(df.data).alias('r')).collect()

[Row(r=None), Row(r=3), Row(r=0)]

#### pyspark.sql.functions.json_object_keys(col: ColumnOrName) → pyspark.sql.column.Column
Returns all the keys of the outermost JSON object as an array. If a valid JSON object is given, all the keys of the outermost object will be returned as an array. If it is any other valid JSON string, an invalid JSON string or an empty string, the function returns null.

In [120]:
df = spark.createDataFrame([(None,), ('{}',), ('{"key1":1, "key2":2}',)], ['data'])
df.select(json_object_keys(df.data).alias('r')).collect()

[Row(r=None), Row(r=[]), Row(r=['key1', 'key2'])]

#### pyspark.sql.functions.size(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: returns the length of the array or map stored in the column.

In [121]:
df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])
df.select(size(df.data)).collect()

[Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)]

#### pyspark.sql.functions.cardinality(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: returns the length of the array or map stored in the column.

In [122]:
spark.createDataFrame(
    [([1, 2, 3],),([1],),([],)], ['data']
).select(cardinality("data")).show()

+-----------------+
|cardinality(data)|
+-----------------+
|                3|
|                1|
|                0|
+-----------------+



#### pyspark.sql.functions.struct(*cols: Union[ColumnOrName, List[ColumnOrName_], Tuple[ColumnOrName_, …]]) → pyspark.sql.column.Column
Creates a new struct column.

In [123]:
df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))
df.select(struct('age', 'name').alias("struct")).collect()

[Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))]

In [124]:
df.select(struct([df.age, df.name]).alias("struct")).collect()

[Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))]

#### pyspark.sql.functions.sort_array(col: ColumnOrName, asc: bool = True) → pyspark.sql.column.Column
Collection function: sorts the input array in ascending or descending order according to the natural ordering of the array elements. Null elements will be placed at the beginning of the returned array in ascending order or at the end of the returned array in descending order.

##### Parameters: 
- col: Column or str | 
name of column or expression
- asc: bool, optional | 
whether to sort in ascending or descending order. If asc is True (default) then ascending and if False then descending.

Returns – Column | 
sorted array.

In [125]:
df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data'])
df.select(sort_array(df.data).alias('r')).collect()

[Row(r=[None, 1, 2, 3]), Row(r=[1]), Row(r=[])]

In [126]:
df.select(sort_array(df.data, asc=False).alias('r')).collect()

[Row(r=[3, 2, 1, None]), Row(r=[1]), Row(r=[])]

#### pyspark.sql.functions.array_max(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: returns the maximum value of the array.

In [127]:
df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data'])
df.select(array_max(df.data).alias('max')).collect()

[Row(max=3), Row(max=10)]

#### pyspark.sql.functions.array_min(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: returns the minimum value of the array.

In [128]:
df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data'])
df.select(array_min(df.data).alias('min')).collect()

[Row(min=1), Row(min=-1)]

#### pyspark.sql.functions.shuffle(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: Generates a random permutation of the given array.

##### Notes
    The function is non-deterministic.

In [129]:
df = spark.createDataFrame([([1, 20, 3, 5],), ([1, 20, None, 3],)], ['data'])
df.select(shuffle(df.data).alias('s')).collect()  

[Row(s=[20, 1, 3, 5]), Row(s=[None, 3, 20, 1])]

#### pyspark.sql.functions.reverse(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: returns a reversed string or an array with reverse order of elements.

In [130]:
df = spark.createDataFrame([('Spark SQL',)], ['data'])
df.select(reverse(df.data).alias('s')).collect()

[Row(s='LQS krapS')]

In [131]:
df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data'])
df.select(reverse(df.data).alias('r')).collect()

[Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])]

#### pyspark.sql.functions.flatten(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: creates a single array from an array of arrays. If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.

In [132]:
df = spark.createDataFrame([([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], ['data'])
df.show(truncate=False)

+------------------------+
|data                    |
+------------------------+
|[[1, 2, 3], [4, 5], [6]]|
|[NULL, [4, 5]]          |
+------------------------+



In [133]:
df.select(flatten(df.data).alias('r')).show()

+------------------+
|                 r|
+------------------+
|[1, 2, 3, 4, 5, 6]|
|              NULL|
+------------------+



#### pyspark.sql.functions.sequence(start: ColumnOrName, stop: ColumnOrName, step: Optional[ColumnOrName] = None) → pyspark.sql.column.Column[source]
Generate a sequence of integers from start to stop, incrementing by step. If step is not set, incrementing by 1 if start is less than or equal to stop, otherwise -1.

##### Parameters:
- start: Column or str | 
starting value (inclusive)
- stop: Column or str | 
last values (inclusive)
- step: Column or str, optional | 
value to add to current to get next element (default is 1)

Returns – Column | 
an array of sequence values

In [134]:
df1 = spark.createDataFrame([(-2, 2)], ('C1', 'C2'))
df1.select(sequence('C1', 'C2').alias('r')).collect()

[Row(r=[-2, -1, 0, 1, 2])]

In [135]:
df2 = spark.createDataFrame([(4, -4, -2)], ('C1', 'C2', 'C3'))
df2.select(sequence('C1', 'C2', 'C3').alias('r')).collect()

[Row(r=[4, 2, 0, -2, -4])]

#### pyspark.sql.functions.array_repeat(col: ColumnOrName, count: Union[ColumnOrName, int]) → pyspark.sql.column.Column
Collection function: creates an array containing a column repeated count times.

In [136]:
df = spark.createDataFrame([('ab',)], ['data'])
df.select(array_repeat(df.data, 3).alias('r')).collect()

[Row(r=['ab', 'ab', 'ab'])]

#### pyspark.sql.functions.map_contains_key(col: ColumnOrName, value: Any) → pyspark.sql.column.Column
Returns true if the map contains the key.

In [137]:
df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
df.select(map_contains_key("data", 1)).show()

+---------------------------------+
|array_contains(map_keys(data), 1)|
+---------------------------------+
|                             true|
+---------------------------------+



In [138]:
df.select(map_contains_key("data", -1)).show()

+----------------------------------+
|array_contains(map_keys(data), -1)|
+----------------------------------+
|                             false|
+----------------------------------+



#### pyspark.sql.functions.map_keys(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: Returns an unordered array containing the keys of the map.

In [139]:
df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
df.select(map_keys("data").alias("keys")).show()

+------+
|  keys|
+------+
|[1, 2]|
+------+



#### pyspark.sql.functions.map_values(col: ColumnOrName) → pyspark.sql.column.Column¶
Collection function: Returns an unordered array containing the values of the map.

In [140]:
df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
df.select(map_values("data").alias("values")).show()

+------+
|values|
+------+
|[a, b]|
+------+



#### pyspark.sql.functions.map_entries(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: Returns an unordered array of all entries in the given map.

In [141]:
df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
df = df.select(map_entries("data").alias("entries"))
df.show()

+----------------+
|         entries|
+----------------+
|[{1, a}, {2, b}]|
+----------------+



In [142]:
df.printSchema()

root
 |-- entries: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- key: integer (nullable = false)
 |    |    |-- value: string (nullable = false)



#### pyspark.sql.functions.map_from_entries(col: ColumnOrName) → pyspark.sql.column.Column
Collection function: Converts an array of entries (key value struct types) to a map of values.

In [143]:
df = spark.sql("SELECT array(struct(1, 'a'), struct(2, 'b')) as data")
df.select(map_from_entries("data").alias("map")).show()

+----------------+
|             map|
+----------------+
|{1 -> a, 2 -> b}|
+----------------+



#### pyspark.sql.functions.arrays_zip(*cols: ColumnOrName) → pyspark.sql.column.Column¶
Collection function: Returns a merged array of structs in which the N-th struct contains all N-th values of input arrays. If one of the arrays is shorter than others then resulting struct type value will be a null for missing elements.

In [144]:
df = spark.createDataFrame([([1, 2, 3], [2, 4, 6], [3, 6])], ['vals1', 'vals2', 'vals3'])
df = df.select(arrays_zip(df.vals1, df.vals2, df.vals3).alias('zipped'))
df.show(truncate=False)

+------------------------------------+
|zipped                              |
+------------------------------------+
|[{1, 2, 3}, {2, 4, 6}, {3, 6, NULL}]|
+------------------------------------+



In [145]:
df.printSchema()

root
 |-- zipped: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- vals1: long (nullable = true)
 |    |    |-- vals2: long (nullable = true)
 |    |    |-- vals3: long (nullable = true)



#### pyspark.sql.functions.map_concat(*cols: Union[ColumnOrName, List[ColumnOrName_], Tuple[ColumnOrName_, …]]) → pyspark.sql.column.Column
Returns the union of all the given maps.

In [146]:
df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2")
df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False)

+------------------------+
|map3                    |
+------------------------+
|{1 -> a, 2 -> b, 3 -> c}|
+------------------------+



#### pyspark.sql.functions.from_csv(col: ColumnOrName, schema: Union[pyspark.sql.column.Column, str], options: Optional[Dict[str, str]] = None) → pyspark.sql.column.Column
Parses a column containing a CSV string to a row with the specified schema. Returns null, in the case of an unparseable string.

##### Parameters:
- col: Column or str | 
a column or column name in CSV format
- schema: class:`~pyspark.sql.Column` or str | 
a column, or Python string literal with schema in DDL format, to use when parsing the CSV column.
- options: dict, optional | 
options to control parsing. accepts the same options as the CSV datasource. See Data Source Option for the version you use.

Returns– Column | 
a column of parsed CSV values

In [147]:
data = [("1,2,3",)]
df = spark.createDataFrame(data, ("value",))
df.select(from_csv(df.value, "a INT, b INT, c INT").alias("csv")).collect()

[Row(csv=Row(a=1, b=2, c=3))]

In [148]:
value = data[0][0]
df.select(from_csv(df.value, schema_of_csv(value)).alias("csv")).collect()

[Row(csv=Row(_c0=1, _c1=2, _c2=3))]

In [149]:
data = [("   abc",)]
df = spark.createDataFrame(data, ("value",))
options = {'ignoreLeadingWhiteSpace': True}
df.select(from_csv(df.value, "s string", options).alias("csv")).collect()

[Row(csv=Row(s='abc'))]

#### pyspark.sql.functions.schema_of_csv(csv: ColumnOrName, options: Optional[Dict[str, str]] = None) → pyspark.sql.column.Column
Parses a CSV string and infers its schema in DDL format.

In [150]:
df = spark.range(1)
df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect()

[Row(csv='STRUCT<_c0: INT, _c1: STRING>')]

In [151]:
df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect()

[Row(csv='STRUCT<_c0: INT, _c1: STRING>')]

#### pyspark.sql.functions.str_to_map(text: ColumnOrName, pairDelim: Optional[ColumnOrName] = None, keyValueDelim: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
Creates a map after splitting the text into key/value pairs using delimiters. Both pairDelim and keyValueDelim are treated as regular expressions.

##### Parameters:
- text: Column or str | 
Input column or strings.
- pairDelim: Column or str, optional | 
delimiter to use to split pair.
- keyValueDelim: Column or str, optional | 
delimiter to use to split key/value.

In [152]:
df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
df.select(str_to_map(df.e, lit(","), lit(":")).alias('r')).collect()

[Row(r={'a': '1', 'b': '2', 'c': '3'})]

In [153]:
df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
df.select(str_to_map(df.e, lit(",")).alias('r')).collect()

[Row(r={'a': '1', 'b': '2', 'c': '3'})]

In [154]:
df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])
df.select(str_to_map(df.e).alias('r')).collect()

[Row(r={'a': '1', 'b': '2', 'c': '3'})]

#### pyspark.sql.functions.to_csv(col: ColumnOrName, options: Optional[Dict[str, str]] = None) → pyspark.sql.column.Column¶
Converts a column containing a StructType into a CSV string. Throws an exception, in the case of an unsupported type.

##### Parameters:
- col: Column or str | 
name of column containing a struct.
- options: dict, optional | 
options to control converting. accepts the same options as the CSV datasource. See Data Source Option for the version you use.

Returns – Column | 
a CSV string converted from given StructType.

In [155]:
data = [(1, Row(age=2, name='Alice'))]
df = spark.createDataFrame(data, ("key", "value"))
df.select(to_csv(df.value).alias("csv")).collect()

[Row(csv='2,Alice')]

#### pyspark.sql.functions.try_element_at(col: ColumnOrName, extraction: ColumnOrName) → pyspark.sql.column.Column
(array, index) - Returns element of array at given (1-based) index. If Index is 0, Spark will throw an error. If index < 0, accesses elements from the last to the first. The function always returns NULL if the index exceeds the length of the array.

(map, key) - Returns value for given key. The function always returns NULL if the key is not contained in the map.

In [156]:
df = spark.createDataFrame([(["a", "b", "c"],)], ['data'])
df.select(try_element_at(df.data, lit(1)).alias('r')).collect()

[Row(r='a')]

In [157]:
df.select(try_element_at(df.data, lit(-1)).alias('r')).collect()

[Row(r='c')]

In [158]:
df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data'])
df.select(try_element_at(df.data, lit("a")).alias('r')).collect()

[Row(r=1.0)]

## [Partition Transformation Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#partition-transformation-functions)

##### Notes

This function can be used only in combination with partitionedBy() method of the DataFrameWriterV2.

Get AnalysisException: [REQUIRES_SINGLE_PART_NAMESPACE] spark_catalog requires a single-part namespace, but got `catalog`.`db`.

#### pyspark.sql.functions.years(col: ColumnOrName) → pyspark.sql.column.Column
Partition transform function: A transform for timestamps and dates to partition data into years.

In [75]:
"""df.writeTo("catalog.db.table").partitionedBy(  
    years("ts")
).createOrReplace()
"""

'df.writeTo("catalog.db.table").partitionedBy(  \n    years("ts")\n).createOrReplace()\n'

#### pyspark.sql.functions.months(col: ColumnOrName) → pyspark.sql.column.Column
Partition transform function: A transform for timestamps and dates to partition data into months.

In [78]:
"""df.writeTo("catalog.db.table").partitionedBy(
    months("ts")
).createOrReplace()
"""

'df.writeTo("catalog.db.table").partitionedBy(\n    months("ts")\n).createOrReplace()\n'

#### pyspark.sql.functions.days(col: ColumnOrName) → pyspark.sql.column.Column
Partition transform function: A transform for timestamps and dates to partition data into days.

In [80]:
"""df.writeTo("catalog.db.table").partitionedBy(  
    days("ts")
).createOrReplace()"""

'df.writeTo("catalog.db.table").partitionedBy(  \n    days("ts")\n).createOrReplace()'

#### pyspark.sql.functions.hours(col: ColumnOrName) → pyspark.sql.column.Column
Partition transform function: A transform for timestamps to partition data into hours.

In [83]:
"""df.writeTo("catalog.db.table").partitionedBy(   
    hours("ts")
).createOrReplace()"""

'df.writeTo("catalog.db.table").partitionedBy(   \n    hours("ts")\n).createOrReplace()'

#### pyspark.sql.functions.bucket(numBuckets: Union[pyspark.sql.column.Column, int], col: ColumnOrName) → pyspark.sql.column.Column
Partition transform function: A transform for any type that partitions by a hash of the input column.

##### Notes
    This function can be used only in combination with partitionedBy() method of the DataFrameWriterV2.

In [86]:
""" df.writeTo("catalog.db.table").partitionedBy(  
    bucket(42, "ts")
).createOrReplace()"""

' df.writeTo("catalog.db.table").partitionedBy(  \n    bucket(42, "ts")\n).createOrReplace()'

In [40]:
"""
# Створюємо DataFrame з числовими даними
data = [(1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,)]
df = spark.createDataFrame(data, ["value"])
num_buckets = 3
# Використовуємо bucket() для розділення числових значень
# bucketed_df = df.withColumn("bucket", bucket(col="value", numBuckets=3))
# bucketed_df = df.withColumn("bucket", expr("bucket(value, {})".format(num_buckets)))
# Виводимо результат
bucketed_df.show()
"""
True

True

## [Aggregate Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#aggregate-functions)

#### pyspark.sql.functions.any_value(col: ColumnOrName, ignoreNulls: Union[bool, pyspark.sql.column.Column, None] = None) → pyspark.sql.column.Column
Returns some value of col for a group of rows.

In [69]:
tour.select(any_value("Player name"), any_value("Value")).show()

+----------------------+----------------+
|any_value(Player name)|any_value(Value)|
+----------------------+----------------+
|       Robert Garrigus|              71|
+----------------------+----------------+



                                                                                

In [70]:
df = spark.createDataFrame([(None, 1),
                            ("a", 2),
                            ("a", 3),
                            ("b", 8),
                            ("b", 2)], ["c1", "c2"])
df.select(any_value('c1'), any_value('c2')).collect()

[Row(any_value(c1)=None, any_value(c2)=1)]

In [71]:
df.select(any_value('c1', True), any_value('c2', True)).collect()

[Row(any_value(c1)='a', any_value(c2)=1)]

#### pyspark.sql.functions.approxCountDistinct(col: ColumnOrName, rsd: Optional[float] = None) → pyspark.sql.column.Column
##### Use approx_count_distinct() instead.

#### pyspark.sql.functions.approx_count_distinct(col: ColumnOrName, rsd: Optional[float] = None) → pyspark.sql.column.Column
Aggregate function: returns a new Column for approximate distinct count of column col.

In [3]:
df = spark.createDataFrame([1,2,2,3], "INT")
df.agg(approx_count_distinct("value").alias('distinct_values')).show()

23/10/30 22:03:33 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 8:>                                                        (0 + 12) / 12]

+---------------+
|distinct_values|
+---------------+
|              3|
+---------------+



                                                                                

#### pyspark.sql.functions.approx_percentile(col: ColumnOrName, percentage: Union[pyspark.sql.column.Column, float, List[float], Tuple[float]], accuracy: Union[pyspark.sql.column.Column, float] = 10000) → pyspark.sql.column.Column
Returns the approximate percentile of the numeric column col which is the smallest value in the ordered col values (sorted from least to greatest) such that no more than percentage of col values is less than the value or equal to that value.

In [4]:
key = (col("id") % 3).alias("key")
value = (randn(42) + key * 10).alias("value")
df = spark.range(0, 1000, 1, 1).select(key, value)
df.select(
    approx_percentile("value", [0.25, 0.5, 0.75], 1000000)
).printSchema()

root
 |-- approx_percentile(value, array(0.25, 0.5, 0.75), 1000000): array (nullable = true)
 |    |-- element: double (containsNull = false)



In [7]:
df.groupBy("key").agg(
    approx_percentile("value", 0.5, lit(1000000))
).printSchema()

root
 |-- key: long (nullable = true)
 |-- approx_percentile(value, 0.5, 1000000): double (nullable = true)



#### pyspark.sql.functions.array_agg(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns a list of objects with duplicates.

In [8]:
df = spark.createDataFrame([[1],[1],[2]], ["c"])
df.agg(array_agg('c').alias('r')).collect()

[Row(r=[1, 1, 2])]

#### pyspark.sql.functions.avg(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the average of the values in a group.

In [9]:
df = spark.range(10)
df.select(avg(col("id"))).show()

+-------+
|avg(id)|
+-------+
|    4.5|
+-------+



In [3]:
students.filter(col("reading score") > 80).groupBy("gender").agg(avg(col("math score")).alias("average")).show()

+------+-----------------+
|gender|          average|
+------+-----------------+
|female|79.16981132075472|
|  male| 87.6842105263158|
+------+-----------------+



#### pyspark.sql.functions.bit_and(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the bitwise AND of all non-null input values, or null if none

In [4]:
df = spark.createDataFrame([[1],[1],[2]], ["c"])
df.select(bit_and("c")).first()

                                                                                

Row(bit_and(c)=0)

#### pyspark.sql.functions.bit_or(col: ColumnOrName) → pyspark.sql.column.Column¶
Aggregate function: returns the bitwise OR of all non-null input values, or null if none.

In [5]:
#### df = spark.createDataFrame([[1],[1],[2]], ["c"])
df.select(bit_or("c")).first()

Row(bit_or(c)=3)

#### pyspark.sql.functions.bit_xor(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the bitwise XOR of all non-null input values, or null if none.

In [6]:
df = spark.createDataFrame([[1],[1],[2]], ["c"])
df.select(bit_xor("c")).first()

Row(bit_xor(c)=2)

#### pyspark.sql.functions.bool_and(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns true if all values of col are true.

In [7]:
df = spark.createDataFrame([[True], [True], [True]], ["flag"])
df.select(bool_and("flag")).show()

+--------------+
|bool_and(flag)|
+--------------+
|          true|
+--------------+



In [8]:
df = spark.createDataFrame([[True], [False], [True]], ["flag"])
df.select(bool_and("flag")).show()

+--------------+
|bool_and(flag)|
+--------------+
|         false|
+--------------+



In [9]:
df = spark.createDataFrame([[False], [False], [False]], ["flag"])
df.select(bool_and("flag")).show()

+--------------+
|bool_and(flag)|
+--------------+
|         false|
+--------------+



#### pyspark.sql.functions.bool_or(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns true if at least one value of col is true.

In [10]:
df = spark.createDataFrame([[True], [True], [True]], ["flag"])
df.select(bool_or("flag")).show()

+-------------+
|bool_or(flag)|
+-------------+
|         true|
+-------------+



In [11]:
df = spark.createDataFrame([[True], [False], [True]], ["flag"])
df.select(bool_or("flag")).show()

+-------------+
|bool_or(flag)|
+-------------+
|         true|
+-------------+



In [12]:
df = spark.createDataFrame([[False], [False], [False]], ["flag"])
df.select(bool_or("flag")).show()

+-------------+
|bool_or(flag)|
+-------------+
|        false|
+-------------+



#### pyspark.sql.functions.collect_list(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns a list of objects with duplicates.


##### Notes:
The function is non-deterministic because the order of collected results depends on the order of the rows which may be non-deterministic after a shuffle.

In [13]:
df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
df2.agg(collect_list('age')).collect()

[Row(collect_list(age)=[2, 5, 5])]

#### pyspark.sql.functions.collect_set(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns a set of objects with duplicate elements eliminated.

##### Notes:
The function is non-deterministic because the order of collected results depends on the order of the rows which may be non-deterministic after a shuffle.

In [14]:
df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
df2.agg(array_sort(collect_set('age')).alias('c')).collect()

[Row(c=[2, 5])]

#### pyspark.sql.functions.corr(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Returns a new Column for the Pearson Correlation Coefficient for col1 and col2.

In [17]:
a = range(20)
b = [2 * x for x in range(20)]
df = spark.createDataFrame(zip(a, b), ["a", "b"])
df.agg(corr("a", "b").alias('c')).collect()

[Row(c=1.0)]

#### pyspark.sql.functions.count(col: ColumnOrName) → pyspark.sql.column.Column¶
Aggregate function: returns the number of items in a group.

Count by all columns (start), and by a column that does not count None.

In [18]:
df = spark.createDataFrame([(None,), ("a",), ("b",), ("c",)], schema=["alphabets"])
df.select(count(expr("*")), count(df.alphabets)).show()

+--------+----------------+
|count(1)|count(alphabets)|
+--------+----------------+
|       4|               3|
+--------+----------------+



#### pyspark.sql.functions.count_distinct(col: ColumnOrName, *cols: ColumnOrName) → pyspark.sql.column.Column¶
Returns a new Column for distinct count of col or cols.

In [20]:
df1 = spark.createDataFrame([1, 1, 3], IntegerType())
df2 = spark.createDataFrame([1, 2], IntegerType())
df1.join(df2).show()

                                                                                

+-----+-----+
|value|value|
+-----+-----+
|    1|    1|
|    1|    2|
|    1|    1|
|    1|    2|
|    3|    1|
|    3|    2|
+-----+-----+



                                                                                

In [21]:
df1.join(df2).select(count_distinct(df1.value, df2.value)).show()



+----------------------------+
|count(DISTINCT value, value)|
+----------------------------+
|                           4|
+----------------------------+



                                                                                

#### pyspark.sql.functions.countDistinct(col: ColumnOrName, *cols: ColumnOrName) → pyspark.sql.column.Column
Returns a new Column for distinct count of col or cols.

An alias of count_distinct(), and it is encouraged to use count_distinct() directly.

#### pyspark.sql.functions.count_min_sketch(col: ColumnOrName, eps: ColumnOrName, confidence: ColumnOrName, seed: ColumnOrName) → pyspark.sql.column.Column
Returns a count-min sketch of a column with the given esp, confidence and seed. The result is an array of bytes, which can be deserialized to a CountMinSketch before usage. Count-min sketch is a probabilistic data structure used for cardinality estimation using sub-linear space.

In [22]:
df = spark.createDataFrame([[1], [2], [1]], ['data'])
df = df.agg(count_min_sketch(df.data, lit(0.5), lit(0.5), lit(1)).alias('sketch'))
df.select(hex(df.sketch).alias('r')).collect()

[Row(r='0000000100000000000000030000000100000004000000005D8D6AB90000000000000000000000000000000200000000000000010000000000000000')]

#### pyspark.sql.functions.count_if(col: ColumnOrName) → pyspark.sql.column.Column
Returns the number of TRUE values for the col.

In [23]:
df = spark.createDataFrame([("a", 1),
                            ("a", 2),
                            ("a", 3),
                            ("b", 8),
                            ("b", 2)], ["c1", "c2"])
df.select(count_if(col('c2') % 2 == 0)).show()

+------------------------+
|count_if(((c2 % 2) = 0))|
+------------------------+
|                       3|
+------------------------+



#### pyspark.sql.functions.covar_pop(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Returns a new Column for the population covariance of col1 and col2.

In [24]:
a = [1] * 10
b = [1] * 10
df = spark.createDataFrame(zip(a, b), ["a", "b"])
df.agg(covar_pop("a", "b").alias('c')).collect()

[Row(c=0.0)]

#### pyspark.sql.functions.covar_samp(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Returns a new Column for the sample covariance of col1 and col2.

In [25]:
a = [1] * 10
b = [1] * 10
df = spark.createDataFrame(zip(a, b), ["a", "b"])
df.agg(covar_samp("a", "b").alias('c')).collect()

[Row(c=0.0)]

#### pyspark.sql.functions.every(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns true if all values of col are true.

In [26]:
spark.createDataFrame(
    [[True], [True], [True]], ["flag"]
).select(every("flag")).show()

+-----------+
|every(flag)|
+-----------+
|       true|
+-----------+



In [27]:
spark.createDataFrame(
    [[True], [False], [True]], ["flag"]
).select(every("flag")).show()

+-----------+
|every(flag)|
+-----------+
|      false|
+-----------+



In [28]:
spark.createDataFrame(
    [[False], [False], [False]], ["flag"]
).select(every("flag")).show()

+-----------+
|every(flag)|
+-----------+
|      false|
+-----------+



#### pyspark.sql.functions.first(col: ColumnOrName, ignorenulls: bool = False) → pyspark.sql.column.Column
Aggregate function: returns the first value in a group.

The function by default returns the first values it sees. It will return the first non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
##### Notes:
The function is non-deterministic because its results depends on the order of the rows which may be non-deterministic after a shuffle.

In [31]:
df = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age"))
df = df.orderBy(df.age)
df.groupby("name").agg(first("age")).orderBy("name").show()

+-----+----------+
| name|first(age)|
+-----+----------+
|Alice|      NULL|
|  Bob|         5|
+-----+----------+



In [32]:
df.groupby("name").agg(first("age", ignorenulls=True)).orderBy("name").show()

+-----+----------+
| name|first(age)|
+-----+----------+
|Alice|         2|
|  Bob|         5|
+-----+----------+



#### pyspark.sql.functions.first_value(col: ColumnOrName, ignoreNulls: Union[bool, pyspark.sql.column.Column, None] = None) → pyspark.sql.column.Column
Returns the first value of col for a group of rows. It will return the first non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned.

In [34]:
spark.createDataFrame(
    [(None, 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["a", "b"]
).select(first_value('a'), first_value('b')).show()

+--------------+--------------+
|first_value(a)|first_value(b)|
+--------------+--------------+
|          NULL|             1|
+--------------+--------------+



In [33]:
spark.createDataFrame(
    [(None, 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["a", "b"]
).select(first_value('a', True), first_value('b', True)).show()

+--------------+--------------+
|first_value(a)|first_value(b)|
+--------------+--------------+
|             a|             1|
+--------------+--------------+



#### pyspark.sql.functions.grouping(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated or not, returns 1 for aggregated or 0 for not aggregated in the result set.

In [35]:
df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))
df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show()

+-----+--------------+--------+
| name|grouping(name)|sum(age)|
+-----+--------------+--------+
| NULL|             1|       7|
|Alice|             0|       2|
|  Bob|             0|       5|
+-----+--------------+--------+



#### pyspark.sql.functions.grouping_id(*cols: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the level of grouping, equals to

    (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + … + grouping(cn)

##### Notes

The list of columns should match with grouping columns exactly, or empty (means all the grouping columns).

In [37]:
df = spark.createDataFrame([(1, "a", "a"),
                            (3, "a", "a"),
                            (4, "b", "c")], ["c1", "c2", "c3"])
df.cube("c2", "c3").agg(grouping_id(), sum("c1")).orderBy("c2", "c3").show()

+----+----+-------------+-------+
|  c2|  c3|grouping_id()|sum(c1)|
+----+----+-------------+-------+
|NULL|NULL|            3|      8|
|NULL|   a|            2|      4|
|NULL|   c|            2|      4|
|   a|NULL|            1|      4|
|   a|   a|            0|      4|
|   b|NULL|            1|      4|
|   b|   c|            0|      4|
+----+----+-------------+-------+



#### pyspark.sql.functions.histogram_numeric(col: ColumnOrName, nBins: ColumnOrName) → pyspark.sql.column.Column

Computes a histogram on numeric ‘col’ using nb bins. The return value is an array of (x,y) pairs representing the centers of the histogram’s bins. As the value of ‘nb’ is increased, the histogram approximation gets finer-grained, but may yield artifacts around outliers. In practice, 20-40 histogram bins appear to work well, with more bins being required for skewed or smaller datasets. Note that this function creates a histogram with non-uniform bin widths. It offers no guarantees in terms of the mean-squared-error of the histogram, but in practice is comparable to the histograms produced by the R/S-Plus statistical computing packages. Note: the output type of the ‘x’ field in the return value is propagated from the input value consumed in the aggregate function.

In [39]:
df = spark.createDataFrame([("a", 1),
                            ("a", 2),
                            ("a", 3),
                            ("b", 8),
                            ("b", 2)], ["c1", "c2"])
df.select(histogram_numeric('c2', lit(5))).show()

+------------------------+
|histogram_numeric(c2, 5)|
+------------------------+
|    [{1, 1.0}, {2, 1....|
+------------------------+



#### pyspark.sql.functions.hll_sketch_agg(col: ColumnOrName, lgConfigK: Union[int, pyspark.sql.column.Column, None] = None) → pyspark.sql.column.Column
Aggregate function: returns the updatable binary representation of the Datasketches HllSketch (HyperLogLog) configured with lgConfigK arg.

In [40]:
df = spark.createDataFrame([1,2,2,3], "INT")
df1 = df.agg(hll_sketch_estimate(hll_sketch_agg("value")).alias("distinct_cnt"))
df1.show()

+------------+
|distinct_cnt|
+------------+
|           3|
+------------+



In [41]:
df2 = df.agg(hll_sketch_estimate(
    hll_sketch_agg("value", lit(12))
).alias("distinct_cnt"))
df2.show()

+------------+
|distinct_cnt|
+------------+
|           3|
+------------+



In [42]:
df3 = df.agg(hll_sketch_estimate(
    hll_sketch_agg(col("value"), lit(12))).alias("distinct_cnt"))
df3.show()

+------------+
|distinct_cnt|
+------------+
|           3|
+------------+



#### pyspark.sql.functions.hll_union_agg(col: ColumnOrName, allowDifferentLgConfigK: Union[bool, pyspark.sql.column.Column, None] = None) → pyspark.sql.column.Column

Aggregate function: returns the updatable binary representation of the Datasketches HllSketch, generated by merging previously created Datasketches HllSketch instances via a Datasketches Union instance. Throws an exception if sketches have different lgConfigK values and allowDifferentLgConfigK is unset or set to false.

In [43]:
df1 = spark.createDataFrame([1,2,2,3], "INT")
df1 = df1.agg(hll_sketch_agg("value").alias("sketch"))
df2 = spark.createDataFrame([4,5,5,6], "INT")
df2 = df2.agg(hll_sketch_agg("value").alias("sketch"))
df3 = df1.union(df2).agg(hll_sketch_estimate(
    hll_union_agg("sketch")
).alias("distinct_cnt"))
df3.drop("sketch").show()

+------------+
|distinct_cnt|
+------------+
|           6|
+------------+



In [44]:
df4 = df1.union(df2).agg(hll_sketch_estimate(
    hll_union_agg("sketch", lit(False))
).alias("distinct_cnt"))
df4.drop("sketch").show()

+------------+
|distinct_cnt|
+------------+
|           6|
+------------+



In [45]:
df5 = df1.union(df2).agg(hll_sketch_estimate(
    hll_union_agg(col("sketch"), lit(False))
).alias("distinct_cnt"))
df5.drop("sketch").show()

+------------+
|distinct_cnt|
+------------+
|           6|
+------------+



#### pyspark.sql.functions.kurtosis(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the kurtosis of the values in a group.

In [47]:
df = spark.createDataFrame([[1],[1],[2]], ["c"])
df.select(kurtosis(df.c)).show()

+-----------+
|kurtosis(c)|
+-----------+
|       -1.5|
+-----------+



#### pyspark.sql.functions.last(col: ColumnOrName, ignorenulls: bool = False) → pyspark.sql.column.Column
Aggregate function: returns the last value in a group.

The function by default returns the last values it sees. It will return the last non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
##### Notes
The function is non-deterministic because its results depends on the order of the rows which may be non-deterministic after a shuffle.

In [48]:
df = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age"))
df = df.orderBy(df.age.desc())
df.groupby("name").agg(last("age")).orderBy("name").show()

+-----+---------+
| name|last(age)|
+-----+---------+
|Alice|     NULL|
|  Bob|        5|
+-----+---------+



In [49]:
df.groupby("name").agg(last("age", ignorenulls=True)).orderBy("name").show()

+-----+---------+
| name|last(age)|
+-----+---------+
|Alice|        2|
|  Bob|        5|
+-----+---------+



#### pyspark.sql.functions.last_value(col: ColumnOrName, ignoreNulls: Union[bool, pyspark.sql.column.Column, None] = None) → pyspark.sql.column.Column
Returns the last value of col for a group of rows. It will return the last non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned.

In [50]:
spark.createDataFrame(
    [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"]
).select(last_value('a'), last_value('b')).show()

+-------------+-------------+
|last_value(a)|last_value(b)|
+-------------+-------------+
|         NULL|            2|
+-------------+-------------+



In [51]:
spark.createDataFrame(
    [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"]
).select(last_value('a', True), last_value('b', True)).show()

+-------------+-------------+
|last_value(a)|last_value(b)|
+-------------+-------------+
|            b|            2|
+-------------+-------------+



#### pyspark.sql.functions.max(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the maximum value of the expression in a group.

In [52]:
df = spark.range(10)
df.select(max(col("id"))).show()

+-------+
|max(id)|
+-------+
|      9|
+-------+



#### pyspark.sql.functions.max_by(col: ColumnOrName, ord: ColumnOrName) → pyspark.sql.column.Column
Returns the value associated with the maximum value of ord.

In [54]:
df = spark.createDataFrame([
    ("Java", 2012, 20000), ("dotNET", 2012, 5000),
    ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
    schema=("course", "year", "earnings"))
df.groupby("course").agg(max_by("year", "earnings")).show()

+------+----------------------+
|course|max_by(year, earnings)|
+------+----------------------+
|  Java|                  2013|
|dotNET|                  2013|
+------+----------------------+



#### pyspark.sql.functions.mean(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the average of the values in a group. An alias of avg().

In [55]:
df = spark.range(10)
df.select(mean(df.id)).show()

+-------+
|avg(id)|
+-------+
|    4.5|
+-------+



#### pyspark.sql.functions.median(col: ColumnOrName) → pyspark.sql.column.Column
Returns the median of the values in a group.

In [56]:
df = spark.createDataFrame([
    ("Java", 2012, 20000), ("dotNET", 2012, 5000),
    ("Java", 2012, 22000), ("dotNET", 2012, 10000),
    ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
    schema=("course", "year", "earnings"))
df.groupby("course").agg(median("earnings")).show()

+------+----------------+
|course|median(earnings)|
+------+----------------+
|  Java|         22000.0|
|dotNET|         10000.0|
+------+----------------+



#### pyspark.sql.functions.min(col: ColumnOrName) → pyspark.sql.column.Column¶
Aggregate function: returns the minimum value of the expression in a group.

In [57]:
df = spark.range(10)
df.select(min(df.id)).show()

+-------+
|min(id)|
+-------+
|      0|
+-------+



#### pyspark.sql.functions.min_by(col: ColumnOrName, ord: ColumnOrName) → pyspark.sql.column.Column¶
Returns the value associated with the minimum value of ord.

In [58]:
df = spark.createDataFrame([
    ("Java", 2012, 20000), ("dotNET", 2012, 5000),
    ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
    schema=("course", "year", "earnings"))
df.groupby("course").agg(min_by("year", "earnings")).show()

+------+----------------------+
|course|min_by(year, earnings)|
+------+----------------------+
|  Java|                  2012|
|dotNET|                  2012|
+------+----------------------+



#### pyspark.sql.functions.mode(col: ColumnOrName) → pyspark.sql.column.Column
Returns the most frequent value in a group.

In [59]:
df = spark.createDataFrame([
    ("Java", 2012, 20000), ("dotNET", 2012, 5000),
    ("Java", 2012, 20000), ("dotNET", 2012, 5000),
    ("dotNET", 2013, 48000), ("Java", 2013, 30000)],
    schema=("course", "year", "earnings"))
df.groupby("course").agg(mode("year")).show()

+------+----------+
|course|mode(year)|
+------+----------+
|  Java|      2012|
|dotNET|      2012|
+------+----------+



#### pyspark.sql.functions.percentile(col: ColumnOrName, percentage: Union[pyspark.sql.column.Column, float, List[float], Tuple[float]], frequency: Union[pyspark.sql.column.Column, int] = 1) → pyspark.sql.column.Column
Returns the exact percentile(s) of numeric column expr at the given percentage(s) with value range in [0.0, 1.0].

In [60]:
key = (col("id") % 3).alias("key")
value = (randn(42) + key * 10).alias("value")
df = spark.range(0, 1000, 1, 1).select(key, value)
df.select(
    percentile("value", [0.25, 0.5, 0.75], lit(1)).alias("quantiles")
).show()

+--------------------+
|           quantiles|
+--------------------+
|[0.74419914941216...|
+--------------------+



In [61]:
df.groupBy("key").agg(
    percentile("value", 0.5, lit(1)).alias("median")
).show()

+---+--------------------+
|key|              median|
+---+--------------------+
|  0|-0.03449962216667901|
|  1|   9.990389751837329|
|  2|  19.967859769284075|
+---+--------------------+



#### yspark.sql.functions.percentile_approx(col: ColumnOrName, percentage: Union[pyspark.sql.column.Column, float, List[float], Tuple[float]], accuracy: Union[pyspark.sql.column.Column, float] = 10000) → pyspark.sql.column.Column

Returns the approximate percentile of the numeric column col which is the smallest value in the ordered col values (sorted from least to greatest) such that no more than percentage of col values is less than the value or equal to that value.

In [62]:
key = (col("id") % 3).alias("key")
value = (randn(42) + key * 10).alias("value")
df = spark.range(0, 1000, 1, 1).select(key, value)
df.select(
    percentile_approx("value", [0.25, 0.5, 0.75], 1000000).alias("quantiles")
).printSchema()

root
 |-- quantiles: array (nullable = true)
 |    |-- element: double (containsNull = false)



In [63]:
df.groupBy("key").agg(
    percentile_approx("value", 0.5, lit(1000000)).alias("median")
).printSchema()

root
 |-- key: long (nullable = true)
 |-- median: double (nullable = true)



#### pyspark.sql.functions.product(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the product of the values in a group.

In [64]:
df = spark.range(1, 10).toDF('x').withColumn('mod3', col('x') % 3)
prods = df.groupBy('mod3').agg(product('x').alias('product'))
prods.orderBy('mod3').show()

+----+-------+
|mod3|product|
+----+-------+
|   0|  162.0|
|   1|   28.0|
|   2|   80.0|
+----+-------+



#### pyspark.sql.functions.reduce(col: ColumnOrName, initialValue: ColumnOrName, merge: Callable[[pyspark.sql.column.Column, pyspark.sql.column.Column], pyspark.sql.column.Column], finish: Optional[Callable[[pyspark.sql.column.Column], pyspark.sql.column.Column]] = None) → pyspark.sql.column.Column

Applies a binary operator to an initial state and all elements in the array, and reduces this to a single state. The final state is converted into the final result by applying a finish function.

Both functions can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052).

In [65]:
df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values"))
df.select(reduce("values", lit(0.0), lambda acc, x: acc + x).alias("sum")).show()

+----+
| sum|
+----+
|42.0|
+----+



In [66]:
def merge(acc, x):
    count = acc.count + 1
    sum = acc.sum + x
    return struct(count.alias("count"), sum.alias("sum"))

df.select(
    reduce(
        "values",
        struct(lit(0).alias("count"), lit(0.0).alias("sum")),
        merge,
        lambda acc: acc.sum / acc.count,
    ).alias("mean")
).show()

+----+
|mean|
+----+
| 8.4|
+----+



#### pyspark.sql.functions.regr_avgx(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the average of the independent variable for non-null pairs in a group, where y is the dependent variable and x is the independent variable.

In [67]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_avgx("y", "x")).first()

Row(regr_avgx(y, x)=0.999)

#### pyspark.sql.functions.regr_avgy(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the average of the dependent variable for non-null pairs in a group, where y is the dependent variable and x is the independent variable.

In [68]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_avgy("y", "x")).first()

Row(regr_avgy(y, x)=9.980732994136464)

#### pyspark.sql.functions.regr_count(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the number of non-null number pairs in a group, where y is the dependent variable and x is the independent variable.

In [69]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_count("y", "x")).first()

Row(regr_count(y, x)=1000)

#### pyspark.sql.functions.regr_intercept(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the intercept of the univariate linear regression line for non-null pairs in a group, where y is the dependent variable and x is the independent variable.

In [70]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_intercept("y", "x")).first()

Row(regr_intercept(y, x)=-0.04961745990969568)

#### pyspark.sql.functions.regr_r2(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the coefficient of determination for non-null pairs in a group, where y is the dependent variable and x is the independent variable.

In [71]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_r2("y", "x")).first()

Row(regr_r2(y, x)=0.9851908293645436)

#### pyspark.sql.functions.regr_slope(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the slope of the linear regression line for non-null pairs in a group, where y is the dependent variable and x is the independent variable.

In [72]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_slope("y", "x")).first()

Row(regr_slope(y, x)=10.040390844891048)

#### pyspark.sql.functions.regr_sxx(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs in a group, where y is the dependent variable and x is the independent variable.

In [73]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_sxx("y", "x")).first()

Row(regr_sxx(y, x)=666.9989999999996)

#### pyspark.sql.functions.regr_sxy(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs in a group, where y is the dependent variable and x is the independent variable.

In [74]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_sxy("y", "x")).first()

Row(regr_sxy(y, x)=6696.93065315148)

#### pyspark.sql.functions.regr_syy(y: ColumnOrName, x: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs in a group, where y is the dependent variable and x is the independent variable.

In [75]:
x = (col("id") % 3).alias("x")
y = (randn(42) + x * 10).alias("y")
df = spark.range(0, 1000, 1, 1).select(x, y)
df.select(regr_syy("y", "x")).first()

Row(regr_syy(y, x)=68250.53503811295)

#### pyspark.sql.functions.skewness(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the skewness of the values in a group.

In [3]:
df = spark.createDataFrame([[1],[1],[2]], ["c"])
df.select(skewness(df.c)).first()

                                                                                

Row(skewness(c)=0.7071067811865475)

In [4]:
students.select(skewness("math score"), skewness("reading score"), skewness("writing score")).show()

+--------------------+-----------------------+-----------------------+
|skewness(math score)|skewness(reading score)|skewness(writing score)|
+--------------------+-----------------------+-----------------------+
|-0.27851657191407453|   -0.25871569927829374|   -0.28900962452114143|
+--------------------+-----------------------+-----------------------+



In [6]:
students.select(skewness("gender")).first()

Row(skewness(gender)=None)

#### pyspark.sql.functions.some(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns true if at least one value of col is true.

In [7]:
spark.createDataFrame(
    [[True], [True], [True]], ["flag"]
).select(some("flag")).show()

+----------+
|some(flag)|
+----------+
|      true|
+----------+



In [8]:
spark.createDataFrame(
    [[True], [False], [True]], ["flag"]
).select(some("flag")).show()

+----------+
|some(flag)|
+----------+
|      true|
+----------+



In [9]:
spark.createDataFrame(
    [[False], [False], [False]], ["flag"]
).select(some("flag")).show()

+----------+
|some(flag)|
+----------+
|     false|
+----------+



#### pyspark.sql.functions.std(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: alias for stddev_samp.

In [10]:
spark.range(6).select(std("id")).show()

+------------------+
|           std(id)|
+------------------+
|1.8708286933869707|
+------------------+



#### pyspark.sql.functions.stddev(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: alias for stddev_samp.

In [11]:
spark.range(6).select(stddev("id")).show()

+------------------+
|        stddev(id)|
+------------------+
|1.8708286933869707|
+------------------+



In [14]:
students.select(stddev("math score"), stddev("reading score"), stddev("writing score")).show()

+------------------+---------------------+---------------------+
|stddev(math score)|stddev(reading score)|stddev(writing score)|
+------------------+---------------------+---------------------+
|15.163080096009454|   14.600191937252223|    15.19565701086966|
+------------------+---------------------+---------------------+



#### pyspark.sql.functions.stddev_pop(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns population standard deviation of the expression in a group.

In [15]:
spark.range(6).select(stddev_pop("id")).show()

+-----------------+
|   stddev_pop(id)|
+-----------------+
|1.707825127659933|
+-----------------+



In [16]:
students.select(stddev_pop("math score"), stddev_pop("reading score"), stddev_pop("writing score")).show()

+----------------------+-------------------------+-------------------------+
|stddev_pop(math score)|stddev_pop(reading score)|stddev_pop(writing score)|
+----------------------+-------------------------+-------------------------+
|     15.15549665962815|       14.592890015346523|       15.188057281956775|
+----------------------+-------------------------+-------------------------+



#### pyspark.sql.functions.stddev_samp(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the unbiased sample standard deviation of the expression in a group.

In [17]:
spark.range(6).select(stddev_samp("id")).show()

+------------------+
|   stddev_samp(id)|
+------------------+
|1.8708286933869707|
+------------------+



In [18]:
students.select(stddev_samp("math score"), stddev_samp("reading score"), stddev_samp("writing score")).show()

+-----------------------+--------------------------+--------------------------+
|stddev_samp(math score)|stddev_samp(reading score)|stddev_samp(writing score)|
+-----------------------+--------------------------+--------------------------+
|     15.163080096009454|        14.600191937252223|         15.19565701086966|
+-----------------------+--------------------------+--------------------------+



#### pyspark.sql.functions.sum(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the sum of all values in the expression.

In [19]:
df = spark.range(10)
df.select(sum(df["id"])).show()

+-------+
|sum(id)|
+-------+
|     45|
+-------+



In [20]:
students.select(sum("math score")).collect()

[Row(sum(math score)=66089)]

#### pyspark.sql.functions.sum_distinct(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the sum of distinct values in the expression.

In [21]:
df = spark.createDataFrame([(None,), (1,), (1,), (2,)], schema=["numbers"])
df.select(sum_distinct(col("numbers"))).show()

+---------------------+
|sum(DISTINCT numbers)|
+---------------------+
|                    3|
+---------------------+



In [22]:
students.select(sum_distinct("math score")).collect()

[Row(sum(DISTINCT math score)=4808)]

#### pyspark.sql.functions.sumDistinct(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the sum of distinct values in the expression.

    Deprecated since version 3.2.0: Use sum_distinct() instead.

#### pyspark.sql.functions.var_pop(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the population variance of the values in a group.

In [23]:
df = spark.range(6)
df.select(var_pop(df.id)).first()

Row(var_pop(id)=2.9166666666666665)

In [24]:
students.select(var_pop("math score"), var_pop("reading score"), var_pop("writing score")).show()

+-------------------+----------------------+----------------------+
|var_pop(math score)|var_pop(reading score)|var_pop(writing score)|
+-------------------+----------------------+----------------------+
| 229.68907900000005|    212.95243900000023|    230.67708400000024|
+-------------------+----------------------+----------------------+



#### pyspark.sql.functions.var_samp(col: ColumnOrName) → pyspark.sql.column.Column
Aggregate function: returns the unbiased sample variance of the values in a group.

In [25]:
df = spark.range(6)
df.select(var_samp(df.id)).show()

+------------+
|var_samp(id)|
+------------+
|         3.5|
+------------+



In [26]:
students.select(var_samp("math score"), var_samp("reading score"), var_samp("writing score")).show()

+--------------------+-----------------------+-----------------------+
|var_samp(math score)|var_samp(reading score)|var_samp(writing score)|
+--------------------+-----------------------+-----------------------+
|  229.91899799799805|     213.16560460460482|     230.90799199199222|
+--------------------+-----------------------+-----------------------+



#### pyspark.sql.functions.variance(col: ColumnOrName) → pyspark.sql.column.Column¶
Aggregate function: alias for var_samp

In [27]:
df = spark.range(6)
df.select(variance(df.id)).show()

+------------+
|var_samp(id)|
+------------+
|         3.5|
+------------+



In [28]:
students.select(variance("math score"), variance("reading score"), variance("writing score")).show()

+--------------------+-----------------------+-----------------------+
|var_samp(math score)|var_samp(reading score)|var_samp(writing score)|
+--------------------+-----------------------+-----------------------+
|  229.91899799799805|     213.16560460460482|     230.90799199199222|
+--------------------+-----------------------+-----------------------+



## [Window Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#window-functions)

#### pyspark.sql.functions.cume_dist() → pyspark.sql.column.Column
Window function: returns the cumulative distribution of values within a window partition, i.e. the fraction of rows that are below the current row.

In [201]:
df = spark.createDataFrame([1, 2, 3, 3, 4], IntegerType())
w = Window.orderBy("value")
df.withColumn("cd", cume_dist().over(w)).show()

23/10/22 20:48:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:48:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:48:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----+---+
|value| cd|
+-----+---+
|    1|0.2|
|    2|0.4|
|    3|0.8|
|    3|0.8|
|    4|1.0|
+-----+---+



23/10/22 20:48:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:48:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [205]:
w = Window.orderBy("math score")
students.orderBy(desc("reading score")).limit(50).withColumn("cd", cume_dist().over(w)).show(50)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+----+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|  cd|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+----+
|female|       group B|         associate's degree|free/reduced|              completed|        76|           94|           87|0.02|
|female|       group B|            master's degree|free/reduced|              completed|        77|           97|           94|0.06|
|female|       group C|          bachelor's degree|    standard|              completed|        77|           94|           95|0.06|
|female|       group D|           some high school|    standard|                   none|        81|           97|           96|0.08|
|female|       group D|               some college|    standard|     

23/10/22 20:50:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:50:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:50:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


#### pyspark.sql.functions.dense_rank() → pyspark.sql.column.Column
Window function: returns the rank of rows within a window partition, without any gaps.

The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking sequence when there are ties. That is, if you were ranking a competition using dense_rank and had three people tie for second place, you would say that all three were in second place and that the next person came in third. Rank would give me sequential numbers, making the person that came in third place (after the ties) would register as coming in fifth.

This is equivalent to the DENSE_RANK function in SQL.

In [206]:
df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())
w = Window.orderBy("value")
df.withColumn("drank", dense_rank().over(w)).show()

23/10/22 20:51:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:51:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:51:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----+-----+
|value|drank|
+-----+-----+
|    1|    1|
|    1|    1|
|    2|    2|
|    3|    3|
|    3|    3|
|    4|    4|
+-----+-----+



In [207]:
w = Window.orderBy("math score")
students.orderBy(desc("reading score")).limit(50).withColumn("drank", dense_rank().over(w)).show(50)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+-----+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|drank|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+-----+
|female|       group B|         associate's degree|free/reduced|              completed|        76|           94|           87|    1|
|female|       group B|            master's degree|free/reduced|              completed|        77|           97|           94|    2|
|female|       group C|          bachelor's degree|    standard|              completed|        77|           94|           95|    2|
|female|       group D|           some high school|    standard|                   none|        81|           97|           96|    3|
|female|       group D|               some college|    standar

23/10/22 20:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 20:52:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


#### pyspark.sql.functions.lag(col: ColumnOrName, offset: int = 1, default: Optional[Any] = None) → pyspark.sql.column.Column
Window function: returns the value that is offset rows before the current row, and default if there is less than offset rows before the current row. For example, an offset of one will return the previous row at any given point in the window partition.

This is equivalent to the LAG function in SQL.

Parameters:
- col: Column or str | 
name of column or expression
- offset: int, optional default 1 | 
number of row to extend
- default: optional | 
default value

Returns – Column | 
value before current row based on offset.

In [209]:
df = spark.createDataFrame([("a", 1),
                            ("a", 2),
                            ("a", 3),
                            ("b", 8),
                            ("b", 2)], ["c1", "c2"])
df.show()

w = Window.partitionBy("c1").orderBy("c2")
df.withColumn("previos_value", lag("c2").over(w)).show()

+---+---+
| c1| c2|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
|  b|  8|
|  b|  2|
+---+---+

+---+---+-------------+
| c1| c2|previos_value|
+---+---+-------------+
|  a|  1|         NULL|
|  a|  2|            1|
|  a|  3|            2|
|  b|  2|         NULL|
|  b|  8|            2|
+---+---+-------------+



In [210]:
df.withColumn("previos_value", lag("c2", 1, 0).over(w)).show()

+---+---+-------------+
| c1| c2|previos_value|
+---+---+-------------+
|  a|  1|            0|
|  a|  2|            1|
|  a|  3|            2|
|  b|  2|            0|
|  b|  8|            2|
+---+---+-------------+



In [211]:
df.withColumn("previos_value", lag("c2", 2, -1).over(w)).show()

+---+---+-------------+
| c1| c2|previos_value|
+---+---+-------------+
|  a|  1|           -1|
|  a|  2|           -1|
|  a|  3|            1|
|  b|  2|           -1|
|  b|  8|           -1|
+---+---+-------------+



#### pyspark.sql.functions.lead(col: ColumnOrName, offset: int = 1, default: Optional[Any] = None) → pyspark.sql.column.Column
Window function: returns the value that is offset rows after the current row, and default if there is less than offset rows after the current row. For example, an offset of one will return the next row at any given point in the window partition.

This is equivalent to the LEAD function in SQL.

Parameters:
- col: Column or str | 
name of column or expression
- offset: int, optional default 1 | 
number of row to extend
- default: optional | 
default value

Returns – Column | 
value after current row based on offset.

In [212]:
df = spark.createDataFrame([("a", 1),
                            ("a", 2),
                            ("a", 3),
                            ("b", 8),
                            ("b", 2)], ["c1", "c2"])
df.show()

+---+---+
| c1| c2|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
|  b|  8|
|  b|  2|
+---+---+



In [213]:
w = Window.partitionBy("c1").orderBy("c2")
df.withColumn("next_value", lead("c2").over(w)).show()

+---+---+----------+
| c1| c2|next_value|
+---+---+----------+
|  a|  1|         2|
|  a|  2|         3|
|  a|  3|      NULL|
|  b|  2|         8|
|  b|  8|      NULL|
+---+---+----------+



In [214]:
df.withColumn("next_value", lead("c2", 1, 0).over(w)).show()

+---+---+----------+
| c1| c2|next_value|
+---+---+----------+
|  a|  1|         2|
|  a|  2|         3|
|  a|  3|         0|
|  b|  2|         8|
|  b|  8|         0|
+---+---+----------+



In [215]:
df.withColumn("next_value", lead("c2", 2, -1).over(w)).show()

+---+---+----------+
| c1| c2|next_value|
+---+---+----------+
|  a|  1|         3|
|  a|  2|        -1|
|  a|  3|        -1|
|  b|  2|        -1|
|  b|  8|        -1|
+---+---+----------+



#### pyspark.sql.functions.nth_value(col: ColumnOrName, offset: int, ignoreNulls: Optional[bool] = False) → pyspark.sql.column.Column
Window function: returns the value that is the offsetth row of the window frame (counting from 1), and null if the size of window frame is less than offset rows.

It will return the offsetth non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned.

This is equivalent to the nth_value function in SQL.

Parameters:
- col: Column or str | 
name of column or expression
- offset: int | 
number of row to use as the value
- ignoreNulls: bool, optional | 
indicates the Nth value should skip null in the determination of which row to use

Returns – Column | 
value of nth row.

In [217]:
from pyspark.sql import Window
df = spark.createDataFrame([("a", 1),
                            ("a", 2),
                            ("a", 3),
                            ("b", 8),
                            ("b", 2)], ["c1", "c2"])
df.show()

+---+---+
| c1| c2|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
|  b|  8|
|  b|  2|
+---+---+



In [218]:
w = Window.partitionBy("c1").orderBy("c2")
df.withColumn("nth_value", nth_value("c2", 1).over(w)).show()

+---+---+---------+
| c1| c2|nth_value|
+---+---+---------+
|  a|  1|        1|
|  a|  2|        1|
|  a|  3|        1|
|  b|  2|        2|
|  b|  8|        2|
+---+---+---------+



In [219]:
df.withColumn("nth_value", nth_value("c2", 2).over(w)).show()

+---+---+---------+
| c1| c2|nth_value|
+---+---+---------+
|  a|  1|     NULL|
|  a|  2|        2|
|  a|  3|        2|
|  b|  2|     NULL|
|  b|  8|        8|
+---+---+---------+



#### pyspark.sql.functions.ntile(n: int) → pyspark.sql.column.Column
Window function: returns the ntile group id (from 1 to n inclusive) in an ordered window partition. For example, if n is 4, the first quarter of the rows will get value 1, the second quarter will get 2, the third quarter will get 3, and the last quarter will get 4.

This is equivalent to the NTILE function in SQL.

In [222]:
df = spark.createDataFrame([("a", 1),
                            ("a", 2),
                            ("a", 3),
                            ("b", 8),
                            ("b", 2)], ["c1", "c2"])
df.show()

+---+---+
| c1| c2|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
|  b|  8|
|  b|  2|
+---+---+



In [221]:
w = Window.partitionBy("c1").orderBy("c2")
df.withColumn("ntile", ntile(2).over(w)).show()

+---+---+-----+
| c1| c2|ntile|
+---+---+-----+
|  a|  1|    1|
|  a|  2|    1|
|  a|  3|    2|
|  b|  2|    1|
|  b|  8|    2|
+---+---+-----+



In [227]:
w = Window.partitionBy("parental level of education").orderBy("math score")
students.withColumn("ntile", ntile(10).over(w)).show(1000)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+-----+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|ntile|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+-----+
|female|       group D|         associate's degree|free/reduced|                   none|        26|           31|           38|    1|
|female|       group A|         associate's degree|free/reduced|                   none|        37|           57|           56|    1|
|female|       group C|         associate's degree|    standard|                   none|        39|           64|           57|    1|
|  male|       group D|         associate's degree|    standard|                   none|        40|           52|           43|    1|
|female|       group B|         associate's degree|    standar

#### pyspark.sql.functions.percent_rank() → pyspark.sql.column.Column
Window function: returns the relative rank (i.e. percentile) of rows within a window partition.

In [229]:
df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())
w = Window.orderBy("value")
df.withColumn("pr", percent_rank().over(w)).show()

23/10/22 21:15:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:15:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:15:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----+---+
|value| pr|
+-----+---+
|    1|0.0|
|    1|0.0|
|    2|0.4|
|    3|0.6|
|    3|0.6|
|    4|1.0|
+-----+---+



23/10/22 21:15:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:15:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [241]:
w = Window.partitionBy("parental level of education").orderBy("math score")
students.withColumn("pr", percent_rank().over(w)).select("math score", "pr").distinct().show(100)

+----------+--------------------+
|math score|                  pr|
+----------+--------------------+
|        44| 0.05128205128205128|
|        49| 0.08547008547008547|
|        58| 0.27586206896551724|
|        77|                0.76|
|        90|  0.9095022624434389|
|        53|  0.1452991452991453|
|        79|  0.6379310344827587|
|        60| 0.28444444444444444|
|        59|  0.3089887640449438|
|        35|  0.0449438202247191|
|        59| 0.37435897435897436|
|        92|  0.9775280898876404|
|        61| 0.23076923076923078|
|        41| 0.07865168539325842|
|        51| 0.12669683257918551|
|        57| 0.24434389140271492|
|        63| 0.49743589743589745|
|        69|  0.6461538461538462|
|        53|  0.1724137931034483|
|        69|  0.5288888888888889|
|        58| 0.28054298642533937|
|        29|                 0.0|
|        60|  0.3595505617977528|
|        61|   0.334841628959276|
|        95|  0.9683257918552036|
|        74|                 0.8|
|        88|  

#### pyspark.sql.functions.rank() → pyspark.sql.column.Column
Window function: returns the rank of rows within a window partition.

The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking sequence when there are ties. That is, if you were ranking a competition using dense_rank and had three people tie for second place, you would say that all three were in second place and that the next person came in third. Rank would give me sequential numbers, making the person that came in third place (after the ties) would register as coming in fifth.

This is equivalent to the RANK function in SQL.

In [237]:
df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())
w = Window.orderBy("value")
df.withColumn("drank", rank().over(w)).show()

23/10/22 21:20:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:20:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:20:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----+-----+
|value|drank|
+-----+-----+
|    1|    1|
|    1|    1|
|    2|    3|
|    3|    4|
|    3|    4|
|    4|    6|
+-----+-----+



In [242]:
w = Window.partitionBy("parental level of education").orderBy("math score")
students.withColumn("drank", rank().over(w)).select("math score", "drank").distinct().show(100)

+----------+-----+
|math score|drank|
+----------+-----+
|        68|   57|
|        73|  152|
|        47|   24|
|        72|  131|
|        84|  187|
|        90|  202|
|        92|  211|
|        72|   69|
|        64|   96|
|        58|   67|
|        68|  116|
|        78|  148|
|        89|  106|
|        42|   10|
|        76|  168|
|        64|  102|
|        81|  169|
|        66|   48|
|        48|   30|
|        50|   37|
|        41|    8|
|        55|   20|
|        98|  221|
|        81|   94|
|        39|    3|
|        88|  200|
|        60|   80|
|        69|   61|
|        76|  148|
|        27|    4|
|        99|  223|
|        52|   27|
|        62|   32|
|        70|   27|
|        68|  114|
|        39|   13|
|        53|   18|
|        64|   41|
|        78|   84|
|        75|  137|
|        51|   14|
|        75|   75|
|        87|   51|
|        66|  106|
|        53|   38|
|        90|   54|
|        84|  163|
|        71|   66|
|        92|  109|
|        69|

#### pyspark.sql.functions.row_number() → pyspark.sql.column.Column
Window function: returns a sequential number starting at 1 within a window partition.

In [240]:
df = spark.range(3)
w = Window.orderBy(df.id.desc())
df.withColumn("desc_order", row_number().over(w)).show()

+---+----------+
| id|desc_order|
+---+----------+
|  2|         1|
|  1|         2|
|  0|         3|
+---+----------+



23/10/22 21:21:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:21:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:21:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:21:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/22 21:21:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [246]:
w = Window.partitionBy("parental level of education").orderBy("math score")
students.withColumn("desc_order", row_number().over(w)).select("math score", "desc_order").distinct().limit(20).orderBy(asc("desc_order")).show()

+----------+----------+
|math score|desc_order|
+----------+----------+
|        32|         8|
|        47|        24|
|        53|        49|
|        68|        57|
|        68|        60|
|        69|        62|
|        72|        69|
|        61|        77|
|        62|        78|
|        63|        81|
|        60|        84|
|        80|        93|
|        69|       112|
|        72|       131|
|        73|       152|
|        73|       153|
|        84|       185|
|        84|       187|
|        90|       202|
|        92|       211|
+----------+----------+



## [Sort Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#sort-functions)

#### pyspark.sql.functions.asc(col: ColumnOrName) → pyspark.sql.column.Column
Returns a sort expression based on the ascending order of the given column name.

In [182]:
df = spark.range(5)
df = df.sort(desc("id"))
df.show()

+---+
| id|
+---+
|  4|
|  3|
|  2|
|  1|
|  0|
+---+



In [183]:
df.orderBy(asc("id")).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [189]:
students.orderBy(asc("math score")).toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group C,some high school,free/reduced,none,0,17,10
1,female,group B,high school,free/reduced,none,8,24,23
2,female,group B,some high school,free/reduced,none,18,32,28
3,female,group B,some college,standard,none,19,38,32
4,female,group C,some college,free/reduced,none,22,39,33
...,...,...,...,...,...,...,...,...
995,female,group E,bachelor's degree,standard,none,100,100,100
996,male,group A,some college,standard,completed,100,96,86
997,male,group D,some college,standard,completed,100,97,99
998,male,group E,bachelor's degree,standard,completed,100,100,100


#### pyspark.sql.functions.asc_nulls_first(col: ColumnOrName) → pyspark.sql.column.Column
Returns a sort expression based on the ascending order of the given column name, and null values return before non-null values.

In [184]:
df1 = spark.createDataFrame([(1, "Bob"),
                             (0, None),
                             (2, "Alice")], ["age", "name"])
df1.sort(asc_nulls_first(df1.name)).show()

+---+-----+
|age| name|
+---+-----+
|  0| NULL|
|  2|Alice|
|  1|  Bob|
+---+-----+



#### pyspark.sql.functions.asc_nulls_last(col: ColumnOrName) → pyspark.sql.column.Column
Returns a sort expression based on the ascending order of the given column name, and null values appear after non-null values.

In [185]:
df1 = spark.createDataFrame([(0, None),
                             (1, "Bob"),
                             (2, "Alice")], ["age", "name"])
df1.sort(asc_nulls_last(df1.name)).show()

+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  1|  Bob|
|  0| NULL|
+---+-----+



In [192]:
tour.orderBy(asc("Value")).show()

+----------------+------+--------------------+--------------------+-----+
|     Player Name|Season|           Statistic|            Variable|Value|
+----------------+------+--------------------+--------------------+-----+
|   Jordan Spieth|  2018| Par 4 Eagle Leaders|Par 4 Eagle Leade...| NULL|
|   Gary Woodland|  2016|% of Potential Pt...|% of Potential Pt...| NULL|
|Brendon de Jonge|  2010|FedExCup Season P...|FedExCup Season P...| NULL|
|      Jon Curran|  2016|% of Potential Pt...|% of Potential Pt...| NULL|
| Charley Hoffman|  2017|FedExCup Season P...|FedExCup Season P...| NULL|
| David Lingmerth|  2016|% of Potential Pt...|% of Potential Pt...| NULL|
|      Charlie Wi|  2010|FedExCup Season P...|FedExCup Season P...| NULL|
|    Patrick Reed|  2016|% of Potential Pt...|% of Potential Pt...| NULL|
|    Bubba Watson|  2013|FedExCup Season P...|FedExCup Season P...| NULL|
|    Zach Johnson|  2016|% of Potential Pt...|% of Potential Pt...| NULL|
|      Paul Casey|  2010|FedExCup Seas

                                                                                

In [193]:
tour.orderBy(asc_nulls_last("Value")).show()

+--------------+------+--------------------+--------------------+----------+
|   Player Name|Season|           Statistic|            Variable|     Value|
+--------------+------+--------------------+--------------------+----------+
|   Chris Riley|  2010|Percentage of Ava...|Percentage of Ava...|$1,001,580|
|   Chris Riley|  2010|Percentage of pot...|Percentage of pot...|$1,001,581|
|   Chris Riley|  2010|Money per Event L...|Money per Event L...|$1,001,581|
|Steve Stricker|  2017|      Official Money|Official Money - ...|$1,002,036|
|Steve Stricker|  2017|Percentage of Ava...|Percentage of Ava...|$1,002,036|
|Steve Stricker|  2017|Money per Event L...|Money per Event L...|$1,002,036|
|Steve Stricker|  2017|Percentage of pot...|Percentage of pot...|$1,002,036|
|  Robert Streb|  2016|Percentage of Ava...|Percentage of Ava...|$1,003,359|
|  Robert Streb|  2016|Percentage of pot...|Percentage of pot...|$1,003,362|
|  Robert Streb|  2016|Money per Event L...|Money per Event L...|$1,003,362|

                                                                                

#### pyspark.sql.functions.desc(col: ColumnOrName) → pyspark.sql.column.Column
Returns a sort expression based on the descending order of the given column name.

In [187]:
spark.range(5).orderBy(desc("id")).show()

+---+
| id|
+---+
|  4|
|  3|
|  2|
|  1|
|  0|
+---+



In [191]:
tour.orderBy(desc("Value")).show()

+------------------+------+--------------------+--------------------+--------------------+
|       Player Name|Season|           Statistic|            Variable|               Value|
+------------------+------+--------------------+--------------------+--------------------+
|    Soren Kjeldsen|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|      Jason Dufner|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Vijay Singh|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|     Scott Gregory|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|      Ryan Ruffels|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Andrew Dorn|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|         John Hahn|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Kenny Perry|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|

                                                                                

#### pyspark.sql.functions.desc_nulls_first(col: ColumnOrName) → pyspark.sql.column.Column
Returns a sort expression based on the descending order of the given column name, and null values appear before non-null values.

In [195]:
df1 = spark.createDataFrame([(0, None),
                             (1, "Bob"),
                             (2, "Alice")], ["age", "name"])
df1.sort(desc_nulls_first(df1.name)).show()

+---+-----+
|age| name|
+---+-----+
|  0| NULL|
|  1|  Bob|
|  2|Alice|
+---+-----+



#### pyspark.sql.functions.desc_nulls_last(col: ColumnOrName) → pyspark.sql.column.Column
Returns a sort expression based on the descending order of the given column name, and null values appear after non-null values.

In [196]:
df1 = spark.createDataFrame([(0, None),
                             (1, "Bob"),
                             (2, "Alice")], ["age", "name"])
df1.sort(desc_nulls_last(df1.name)).show()

+---+-----+
|age| name|
+---+-----+
|  1|  Bob|
|  2|Alice|
|  0| NULL|
+---+-----+



In [197]:
tour.orderBy(desc_nulls_last("Value")).show()

[Stage 276:>                                                      (0 + 12) / 12]

+------------------+------+--------------------+--------------------+--------------------+
|       Player Name|Season|           Statistic|            Variable|               Value|
+------------------+------+--------------------+--------------------+--------------------+
|      Jason Dufner|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Vijay Singh|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|    Soren Kjeldsen|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Andrew Dorn|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|     Scott Gregory|  2017|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Kenny Perry|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|      Ryan Ruffels|  2016|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|
|       Harry Ellis|  2018|        Lowest Round|Lowest Round - (T...|     the Memorial/Mu|

                                                                                

## [String Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#string-functions)

#### pyspark.sql.functions.ascii(col: ColumnOrName) → pyspark.sql.column.Column
Computes the numeric value of the first character of the string column.

In [248]:
df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")
df.select(ascii("value")).show()

+------------+
|ascii(value)|
+------------+
|          83|
|          80|
|          80|
+------------+



#### pyspark.sql.functions.base64(col: ColumnOrName) → pyspark.sql.column.Column
Computes the BASE64 encoding of a binary column and returns it as a string column.

In [250]:
df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")
df.select(base64("value")).show()

+----------------+
|   base64(value)|
+----------------+
|        U3Bhcms=|
|    UHlTcGFyaw==|
|UGFuZGFzIEFQSQ==|
+----------------+



#### pyspark.sql.functions.bit_length(col: ColumnOrName) → pyspark.sql.column.Column
Calculates the bit length for the specified string column.

In [257]:
df = spark.createDataFrame([('cat',), ( '🐈',), ( 'кіт',)], ['cat'])
df.select(bit_length('cat')).collect()

[Row(bit_length(cat)=24), Row(bit_length(cat)=32), Row(bit_length(cat)=48)]

#### pyspark.sql.functions.btrim(str: ColumnOrName, trim: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
Remove the leading and trailing trim characters from str.

In [262]:
df = spark.createDataFrame([("SSparkSQLS", "SL", )], ['a', 'b'])
df.select(btrim(df.a, df.b).alias('r')).collect()

[Row(r='parkSQ')]

In [260]:
df = spark.createDataFrame([("    SparkSQL   ",)], ['a'])
df.select(btrim(df.a).alias('r')).collect()

[Row(r='SparkSQL')]

#### pyspark.sql.functions.char(col: ColumnOrName) → pyspark.sql.column.Column
Returns the ASCII character having the binary equivalent to col. If col is larger than 256 the result is equivalent to char(col % 256)

In [263]:
spark.range(1).select(char(lit(65))).show()

+--------+
|char(65)|
+--------+
|       A|
+--------+



#### pyspark.sql.functions.character_length(str: ColumnOrName) → pyspark.sql.column.Column
Returns the character length of string data or number of bytes of binary data. The length of string data includes the trailing spaces. The length of binary data includes binary zeros.

In [264]:
spark.range(1).select(sf.character_length(sf.lit("SparkSQL"))).show()

+--------------------------+
|character_length(SparkSQL)|
+--------------------------+
|                         8|
+--------------------------+



#### pyspark.sql.functions.char_length(str: ColumnOrName) → pyspark.sql.column.Column
Returns the character length of string data or number of bytes of binary data. The length of string data includes the trailing spaces. The length of binary data includes binary zeros.

In [265]:
spark.range(1).select(char_length(lit("SparkSQL"))).show()

+---------------------+
|char_length(SparkSQL)|
+---------------------+
|                    8|
+---------------------+



#### pyspark.sql.functions.concat_ws(sep: str, *cols: ColumnOrName) → pyspark.sql.column.Column¶
Concatenates multiple input string columns together into a single string column, using the given separator.

In [266]:
df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
df.select(concat_ws('-', df.s, df.d).alias('s')).collect()

[Row(s='abcd-123')]

#### pyspark.sql.functions.contains(left: ColumnOrName, right: ColumnOrName) → pyspark.sql.column.Column
Returns a boolean. The value is True if right is found inside left. Returns NULL if either input expression is NULL. Otherwise, returns False. Both left or right must be of STRING or BINARY type.

Parameters:
- left: Column or str | 
The input column or strings to check, may be NULL.
- right: Column or str | 
The input column or strings to find, may be NULL.

In [267]:
df = spark.createDataFrame([("Spark SQL", "Spark")], ['a', 'b'])
df.select(contains(df.a, df.b).alias('r')).collect()

[Row(r=True)]

In [268]:
df = spark.createDataFrame([("414243", "4243",)], ["c", "d"])
df = df.select(to_binary("c").alias("c"), to_binary("d").alias("d"))
df.printSchema()

root
 |-- c: binary (nullable = true)
 |-- d: binary (nullable = true)



In [269]:
df.select(contains("c", "d"), contains("d", "c")).show()

+--------------+--------------+
|contains(c, d)|contains(d, c)|
+--------------+--------------+
|          true|         false|
+--------------+--------------+



#### pyspark.sql.functions.decode(col: ColumnOrName, charset: str) → pyspark.sql.column.Column
Computes the first argument into a string from a binary using the provided character set (one of ‘US-ASCII’, ‘ISO-8859-1’, ‘UTF-8’, ‘UTF-16BE’, ‘UTF-16LE’, ‘UTF-16’).

In [270]:
df = spark.createDataFrame([('abcd',)], ['a'])
df.select(decode("a", "UTF-8")).show()

+----------------+
|decode(a, UTF-8)|
+----------------+
|            abcd|
+----------------+



In [277]:
df = spark.createDataFrame([('кіт кит дельфін',)], ['a'])
df.select(decode("a", "UTF-8")).show()

+----------------+
|decode(a, UTF-8)|
+----------------+
| кіт кит дельфін|
+----------------+



#### pyspark.sql.functions.elt(*inputs: ColumnOrName) → pyspark.sql.column.Column
Returns the n-th input, e.g., returns input2 when n is 2. The function returns NULL if the index exceeds the length of the array and spark.sql.ansi.enabled is set to false. If spark.sql.ansi.enabled is set to true, it throws ArrayIndexOutOfBoundsException for invalid indices.

In [278]:
df = spark.createDataFrame([(1, "scala", "java")], ['a', 'b', 'c'])
df.select(elt(df.a, df.b, df.c).alias('r')).collect()

[Row(r='scala')]

In [281]:
df = spark.createDataFrame([(1, "scala", "java")], ['a', 'b', 'c'])
df.select(elt(df.a, df.b, df.c).alias('r')).collect()

[Row(r='scala')]

In [282]:
df = spark.createDataFrame([("ggg", "scala", "java")], ['a', 'b', 'c'])
df.select(elt(lit(2), df.b, df.c).alias('r')).collect()

[Row(r='java')]

In [286]:
df.select(elt(lit(0), df.b, df.c).alias('r'), elt(lit(3), df.b, df.c).alias('r')).collect()

[Row(r=None, r=None)]

#### pyspark.sql.functions.encode(col: ColumnOrName, charset: str) → pyspark.sql.column.Column
Computes the first argument into a binary from a string using the provided character set (one of ‘US-ASCII’, ‘ISO-8859-1’, ‘UTF-8’, ‘UTF-16BE’, ‘UTF-16LE’, ‘UTF-16’).

In [287]:
df = spark.createDataFrame([('abcd',)], ['c'])
df.select(encode("c", "UTF-8")).show()

+----------------+
|encode(c, UTF-8)|
+----------------+
|   [61 62 63 64]|
+----------------+



In [294]:
df = spark.createDataFrame([('кіт кит дельфін',)], ['c'])
df = df.select(encode("c", "UTF-16").alias("encode"))
df.show(truncate=False)
df.select(decode("encode", "UTF-16")).show()

+-------------------------------------------------------------------------------------------------+
|encode                                                                                           |
+-------------------------------------------------------------------------------------------------+
|[FE FF 04 3A 04 56 04 42 00 20 04 3A 04 38 04 42 00 20 04 34 04 35 04 3B 04 4C 04 44 04 56 04 3D]|
+-------------------------------------------------------------------------------------------------+

+----------------------+
|decode(encode, UTF-16)|
+----------------------+
|       кіт кит дельфін|
+----------------------+



#### pyspark.sql.functions.endswith(str: ColumnOrName, suffix: ColumnOrName) → pyspark.sql.column.Column
Returns a boolean. The value is True if str ends with suffix. Returns NULL if either input expression is NULL. Otherwise, returns False. Both str or suffix must be of STRING or BINARY type.

In [295]:
df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"])
df.select(endswith(df.a, df.b).alias('r')).collect()

[Row(r=False)]

In [296]:
df = spark.createDataFrame([("414243", "4243",)], ["e", "f"])
df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f"))
df.printSchema()

root
 |-- e: binary (nullable = true)
 |-- f: binary (nullable = true)



In [297]:
df.select(endswith("e", "f"), endswith("f", "e")).show()

+--------------+--------------+
|endswith(e, f)|endswith(f, e)|
+--------------+--------------+
|          true|         false|
+--------------+--------------+



#### pyspark.sql.functions.find_in_set(str: ColumnOrName, str_array: ColumnOrName) → pyspark.sql.column.Column
Returns the index (1-based) of the given string (str) in the comma-delimited list (strArray). Returns 0, if the string was not found or if the given string (str) contains a comma.

In [298]:
df = spark.createDataFrame([("ab", "abc,b,ab,c,def")], ['a', 'b'])
df.select(find_in_set(df.a, df.b).alias('r')).collect()

[Row(r=3)]

#### pyspark.sql.functions.format_number(col: ColumnOrName, d: int) → pyspark.sql.column.Column
Formats the number X to a format like ‘#,–#,–#.–’, rounded to d decimal places with HALF_EVEN round mode, and returns the result as a string.

In [299]:
spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect()

[Row(v='5.0000')]

#### pyspark.sql.functions.format_string(format: str, *cols: ColumnOrName) → pyspark.sql.column.Column
Formats the arguments in printf-style and returns the result as a string column.

In [300]:
df = spark.createDataFrame([(5, "hello")], ['a', 'b'])
df.select(format_string('%d %s', df.a, df.b).alias('v')).collect()

[Row(v='5 hello')]

#### pyspark.sql.functions.ilike(str: ColumnOrName, pattern: ColumnOrName, escapeChar: Optional[Column] = None) → pyspark.sql.column.Column
Returns true if str matches pattern with escape case-insensitively, null if any arguments are null, false otherwise. The default escape character is the ‘’.

Parameters: 
- str: Column or str | 
A string.
- pattern: Column or str |
A string. The pattern is a string which is matched literally, with exception to the following special symbols: _ matches any one character in the input (similar to . in posix regular expressions) % matches zero or more characters in the input (similar to .* in posix regular expressions) Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order to match “bc”, the pattern should be “abc”. When SQL config ‘spark.sql.parser.escapedStringLiterals’ is enabled, it falls back to Spark 1.6 behavior regarding string literal parsing. For example, if the config is enabled, the pattern to match “bc” should be “bc”.
- escape: Column |
An character added since Spark 3.0. The default escape character is the ‘’. If an escape character precedes a special symbol or another escape character, the following character is matched literally. It is invalid to escape any other character.

In [301]:
df = spark.createDataFrame([("Spark", "_park")], ['a', 'b'])
df.select(ilike(df.a, df.b).alias('r')).collect()

[Row(r=True)]

In [302]:
df = spark.createDataFrame(
    [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")],
    ['a', 'b']
)
df.select(ilike(df.a, df.b, lit('/')).alias('r')).collect()

[Row(r=True)]

#### pyspark.sql.functions.initcap(col: ColumnOrName) → pyspark.sql.column.Column
Translate the first letter of each word to upper case in the sentence.

In [304]:
spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect()

[Row(v='Ab Cd')]

#### pyspark.sql.functions.instr(str: ColumnOrName, substr: str) → pyspark.sql.column.Column
Locate the position of the first occurrence of substr column in the given string. Returns null if either of the arguments are null.

##### Notes

The position is not zero based, but 1 based index. Returns 0 if substr could not be found in str.

In [306]:
df = spark.createDataFrame([('abcd',)], ['s',])
df.select(instr(df.s, 'b').alias('s')).collect()

[Row(s=2)]

#### pyspark.sql.functions.lcase(str: ColumnOrName) → pyspark.sql.column.Column
Returns str with all characters changed to lowercase.

In [307]:
spark.range(1).select(sf.lcase(sf.lit("Spark"))).show()

+------------+
|lcase(Spark)|
+------------+
|       spark|
+------------+



#### pyspark.sql.functions.length(col: ColumnOrName) → pyspark.sql.column.Column
Computes the character length of string data or number of bytes of binary data. The length of character data includes the trailing spaces. The length of binary data includes binary zeros.

In [308]:
spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect()

[Row(length=4)]

#### pyspark.sql.functions.like(str: ColumnOrName, pattern: ColumnOrName, escapeChar: Optional[Column] = None) → pyspark.sql.column.Column
Returns true if str matches pattern with escape, null if any arguments are null, false otherwise. The default escape character is the ‘’.

Parameters:
- str: Column or str |
A string.
- pattern: Column or str | 
A string. The pattern is a string which is matched literally, with exception to the following special symbols: _ matches any one character in the input (similar to . in posix regular expressions) % matches zero or more characters in the input (similar to .* in posix regular expressions) Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order to match “bc”, the pattern should be “abc”. When SQL config ‘spark.sql.parser.escapedStringLiterals’ is enabled, it falls back to Spark 1.6 behavior regarding string literal parsing. For example, if the config is enabled, the pattern to match “bc” should be “bc”.
- escape: Column |
An character added since Spark 3.0. The default escape character is the ‘’. If an escape character precedes a special symbol or another escape character, the following character is matched literally. It is invalid to escape any other character.

In [309]:
df = spark.createDataFrame([("Spark", "_park")], ['a', 'b'])
df.select(like(df.a, df.b).alias('r')).collect()

[Row(r=True)]

In [310]:
df = spark.createDataFrame(
    [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")],
    ['a', 'b']
)
df.select(like(df.a, df.b, lit('/')).alias('r')).collect()

[Row(r=True)]

#### pyspark.sql.functions.lower(col: ColumnOrName) → pyspark.sql.column.Column¶
Converts a string expression to lower case.

In [311]:
df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")
df.select(lower("value")).show()

+------------+
|lower(value)|
+------------+
|       spark|
|     pyspark|
|  pandas api|
+------------+



#### pyspark.sql.functions.left(str: ColumnOrName, len: ColumnOrName) → pyspark.sql.column.Column
    Returns the leftmost len`(`len can be string type) characters from the string str, if len is less or equal than 0 the result is an empty string.

In [312]:
df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b'])
df.select(left(df.a, df.b).alias('r')).collect()

[Row(r='Spa')]

#### pyspark.sql.functions.levenshtein(left: ColumnOrName, right: ColumnOrName, threshold: Optional[int] = None) → pyspark.sql.column.Column
Computes the Levenshtein distance of the two given strings.

Parameters:
- leftColumn or str
first column value.
- right: Column or str | 
second column value.
- threshold: int, optional | 
if set when the levenshtein distance of the two given strings less than or equal to a given threshold then return result distance, or -1

Returns – Column | Levenshtein distance as integer value.

In [313]:
df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])
df0.select(levenshtein('l', 'r').alias('d')).collect()

[Row(d=3)]

In [314]:
df0.select(levenshtein('l', 'r', 2).alias('d')).collect()

[Row(d=-1)]

#### pyspark.sql.functions.locate(substr: str, str: ColumnOrName, pos: int = 1) → pyspark.sql.column.Column
Locate the position of the first occurrence of substr in a string column, after position pos.

##### Notes

The position is not zero based, but 1 based index. Returns 0 if substr could not be found in str.

In [315]:
df = spark.createDataFrame([('abcd',)], ['s',])
df.select(locate('b', df.s, 1).alias('s')).collect()

[Row(s=2)]

#### pyspark.sql.functions.lpad(col: ColumnOrName, len: int, pad: str) → pyspark.sql.column.Column
Left-pad the string column to width len with pad.

In [316]:
df = spark.createDataFrame([('abcd',)], ['s',])
df.select(lpad(df.s, 6, '#').alias('s')).collect()

[Row(s='##abcd')]

In [318]:
tour.limit(10).select(lpad(tour["Player name"], 20, '~').alias('s')).show()

+--------------------+
|                   s|
+--------------------+
|~~~~~Robert Garrigus|
|~~~~~~~~Bubba Watson|
|~~~~~~Dustin Johnson|
|~~~~~Brett Wetterich|
|~~~~~~~~~J.B. Holmes|
|~~~~~~~~~~~John Daly|
|~~~~~~~Graham DeLaet|
|~~~~~~~Angel Cabrera|
|~~~~~~Charles Warren|
|~~~~~~~~~D.J. Trahan|
+--------------------+



#### pyspark.sql.functions.ltrim(col: ColumnOrName) → pyspark.sql.column.Column
Trim the spaces from left end for the specified string value.

In [319]:
df = spark.createDataFrame(["   Spark", "Spark  ", " Spark"], "STRING")
df.select(ltrim("value").alias("r")).withColumn("length", length("r")).show()

+-------+------+
|      r|length|
+-------+------+
|  Spark|     5|
|Spark  |     7|
|  Spark|     5|
+-------+------+



#### pyspark.sql.functions.mask(col: ColumnOrName, upperChar: Optional[ColumnOrName] = None, lowerChar: Optional[ColumnOrName] = None, digitChar: Optional[ColumnOrName] = None, otherChar: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
Masks the given string value. This can be useful for creating copies of tables with sensitive information removed.

Parameters:

    col: :class:`~pyspark.sql.Column` or str
        target column to compute on.

    upperChar: :class:`~pyspark.sql.Column` or str
        character to replace upper-case characters with. Specify NULL to retain original character.

    lowerChar: :class:`~pyspark.sql.Column` or str
        character to replace lower-case characters with. Specify NULL to retain original character.

    digitChar: :class:`~pyspark.sql.Column` or str
        character to replace digit characters with. Specify NULL to retain original character.

    otherChar: :class:`~pyspark.sql.Column` or str
        character to replace all other characters with. Specify NULL to retain original character.

Returns | Column

In [320]:
df = spark.createDataFrame([("AbCD123-@$#",), ("abcd-EFGH-8765-4321",)], ['data'])
df.select(mask(df.data).alias('r')).collect()

[Row(r='XxXXnnn-@$#'), Row(r='xxxx-XXXX-nnnn-nnnn')]

In [321]:
df.select(mask(df.data, lit('Y')).alias('r')).collect()

[Row(r='YxYYnnn-@$#'), Row(r='xxxx-YYYY-nnnn-nnnn')]

In [322]:
df.select(mask(df.data, lit('Y'), lit('y')).alias('r')).collect()

[Row(r='YyYYnnn-@$#'), Row(r='yyyy-YYYY-nnnn-nnnn')]

In [323]:
df.select(mask(df.data, lit('Y'), lit('y'), lit('d')).alias('r')).collect()

[Row(r='YyYYddd-@$#'), Row(r='yyyy-YYYY-dddd-dddd')]

In [324]:
df.select(mask(df.data, lit('Y'), lit('y'), lit('d'), lit('*')).alias('r')).collect()

[Row(r='YyYYddd****'), Row(r='yyyy*YYYY*dddd*dddd')]

In [326]:
df = spark.createDataFrame([("AbCD123-@$#",),
                            ("abcd-EFGH-8765-4321",),
                            ("1234-5678-8765-4321",),
                            ("12345678",),
                            ("Абаба Галамага",),
                            ("Тест_1234_!'№;_test",),
                            ("k[MY0&[&jwts",),
                           ], ['data'])
df.select(mask(df.data).alias('r')).collect()

[Row(r='XxXXnnn-@$#'),
 Row(r='xxxx-XXXX-nnnn-nnnn'),
 Row(r='nnnn-nnnn-nnnn-nnnn'),
 Row(r='nnnnnnnn'),
 Row(r='Xxxxx Xxxxxxxx'),
 Row(r="Xxxx_nnnn_!'№;_xxxx"),
 Row(r='x[XXn&[&xxxx')]

#### pyspark.sql.functions.octet_length(col: ColumnOrName) → pyspark.sql.column.Column
Calculates the byte length for the specified string column.

In [327]:
spark.createDataFrame([('cat',), ( '🐈',)], ['cat']) \
     .select(octet_length('cat')).collect()

[Row(octet_length(cat)=3), Row(octet_length(cat)=4)]

#### pyspark.sql.functions.parse_url(url: ColumnOrName, partToExtract: ColumnOrName, key: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
Extracts a part from a URL.

    'HOST': Домен (наприклад, www.example.com).
    'PATH': Шлях (наприклад, /page1).
    'QUERY': Рядок запиту (наприклад, ?param1=value1&param2=value2).
    'SCHEME': Протокол (наприклад, http).
    'FILE': Шлях до файлу.
    'REF': Фрагмент (якщо він вказаний у URL).

In [328]:
df = spark.createDataFrame(
    [("http://spark.apache.org/path?query=1", "QUERY", "query",)],
    ["a", "b", "c"]
)
df.select(parse_url(df.a, df.b, df.c).alias('r')).collect()

[Row(r='1')]

In [329]:
df.select(parse_url(df.a, df.b).alias('r')).collect()

[Row(r='query=1')]

In [336]:
df.select(parse_url(df.a, lit("PATH")).alias('r')).collect()

[Row(r='/path')]

In [340]:
df.select(parse_url(df.a, lit("HOST")).alias('r')).collect()

[Row(r='spark.apache.org')]

#### pyspark.sql.functions.position(substr: ColumnOrName, str: ColumnOrName, start: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
Returns the position of the first occurrence of substr in str after position start. The given start and return value are 1-based.

In [344]:
spark.createDataFrame(
    [("bar", "foobarbar", 5,)], ["a", "b", "c"]
).select(position("a", "b", "c")).show()

+-----------------+
|position(a, b, c)|
+-----------------+
|                7|
+-----------------+



In [345]:
spark.createDataFrame(
    [("bar", "foobarbar", 5,)], ["a", "b", "c"]
).select(position("a", "b")).show()

+-----------------+
|position(a, b, 1)|
+-----------------+
|                4|
+-----------------+



#### pyspark.sql.functions.printf(format: ColumnOrName, *cols: ColumnOrName) → pyspark.sql.column.Column
Formats the arguments in printf-style and returns the result as a string column.

In [346]:
spark.createDataFrame(
    [("aa%d%sqwe", 123, "cc",)], ["a", "b", "c"]
).select(printf("a", "b", "c")).show()

+---------------+
|printf(a, b, c)|
+---------------+
|     aa123ccqwe|
+---------------+



#### pyspark.sql.functions.rlike(str: ColumnOrName, regexp: ColumnOrName) → pyspark.sql.column.Column
Returns true if str matches the Java regex regexp, or false otherwise.

In [347]:
df = spark.createDataFrame([("1a 2b 14m", r"(\d+)")], ["str", "regexp"])
df.select(rlike('str', lit(r'(\d+)')).alias('d')).collect()

[Row(d=True)]

In [348]:
df.select(rlike('str', lit(r'\d{2}b')).alias('d')).collect()

[Row(d=False)]

In [349]:
df.select(rlike("str", col("regexp")).alias('d')).collect()

[Row(d=True)]

#### pyspark.sql.functions.regexp(str: ColumnOrName, regexp: ColumnOrName) → pyspark.sql.column.Column
Returns true if str matches the Java regex regexp, or false otherwise.

Parameters:
- str: Column or str | 
target column to work on.
- regexp: Column or str | 
regex pattern to apply.

Returns – Column | true if str matches a Java regex, or false otherwise.

In [351]:
spark.createDataFrame(
    [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
).select(regexp('str', lit(r'(\d+)'))).show()

+------------------+
|REGEXP(str, (\d+))|
+------------------+
|              true|
+------------------+



In [352]:
spark.createDataFrame(
    [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
).select(regexp('str', lit(r'\d{2}b'))).show()

+-------------------+
|REGEXP(str, \d{2}b)|
+-------------------+
|              false|
+-------------------+



In [3]:
spark.createDataFrame(
    [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
).select(regexp('str', col("regexp"))).show()

                                                                                

+-------------------+
|REGEXP(str, regexp)|
+-------------------+
|               true|
+-------------------+



#### pyspark.sql.functions.regexp_like(str: ColumnOrName, regexp: ColumnOrName) → pyspark.sql.column.Column
Returns true if str matches the Java regex regexp, or false otherwise.

In [4]:
spark.createDataFrame(
    [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
).select(regexp_like('str', lit(r'(\d+)'))).show()

+-----------------------+
|REGEXP_LIKE(str, (\d+))|
+-----------------------+
|                   true|
+-----------------------+



In [5]:
spark.createDataFrame(
    [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
).select(regexp_like('str', lit(r'\d{2}b'))).show()

+------------------------+
|REGEXP_LIKE(str, \d{2}b)|
+------------------------+
|                   false|
+------------------------+



In [8]:
spark.createDataFrame(
    [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]
).select(regexp_like('str', col("regexp"))).show()

+------------------------+
|REGEXP_LIKE(str, regexp)|
+------------------------+
|                    true|
+------------------------+



#### pyspark.sql.functions.regexp_count(str: ColumnOrName, regexp: ColumnOrName) → pyspark.sql.column.Column
Returns a count of the number of times that the Java regex pattern regexp is matched in the string str.

In [9]:
df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])
df.select(regexp_count('str', lit(r'\d+')).alias('d')).collect()

[Row(d=3)]

In [10]:
df.select(regexp_count('str', lit(r'mmm')).alias('d')).collect()

[Row(d=0)]

In [11]:
df.select(regexp_count("str", col("regexp")).alias('d')).collect()

[Row(d=3)]

#### pyspark.sql.functions.regexp_extract(str: ColumnOrName, pattern: str, idx: int) → pyspark.sql.column.Column
Extract a specific group matched by the Java regex regexp, from the specified string column. If the regex did not match, or the specified group did not match, an empty string is returned.

In [12]:
df = spark.createDataFrame([('100-200',)], ['str'])
df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()

[Row(d='100')]

In [13]:
df = spark.createDataFrame([('foo',)], ['str'])
df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()

[Row(d='')]

In [14]:
df = spark.createDataFrame([('aaaac',)], ['str'])
df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()

[Row(d='')]

#### pyspark.sql.functions.regexp_extract_all(str: ColumnOrName, regexp: ColumnOrName, idx: Union[int, pyspark.sql.column.Column, None] = None) → pyspark.sql.column.Column
Extract all strings in the str that match the Java regex regexp and corresponding to the regex group index.

In [15]:
df = spark.createDataFrame([("100-200, 300-400", r"(\d+)-(\d+)")], ["str", "regexp"])
df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)')).alias('d')).collect()

[Row(d=['100', '300'])]

In [16]:
df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 1).alias('d')).collect()


[Row(d=['100', '300'])]

In [17]:
df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 2).alias('d')).collect()


[Row(d=['200', '400'])]

In [18]:
df.select(regexp_extract_all('str', col("regexp")).alias('d')).collect()

[Row(d=['100', '300'])]

#### pyspark.sql.functions.regexp_replace(string: ColumnOrName, pattern: Union[str, pyspark.sql.column.Column], replacement: Union[str, pyspark.sql.column.Column]) → pyspark.sql.column.Column
Replace all substrings of the specified string value that match regexp with replacement.

In [19]:
df = spark.createDataFrame([("100-200", r"(\d+)", "--")], ["str", "pattern", "replacement"])
df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()

[Row(d='-----')]

In [20]:
df.select(regexp_replace("str", col("pattern"), col("replacement")).alias('d')).collect()

[Row(d='-----')]

#### pyspark.sql.functions.regexp_substr(str: ColumnOrName, regexp: ColumnOrName) → pyspark.sql.column.Column
Returns the substring that matches the Java regex regexp within the string str. If the regular expression is not found, the result is null.

In [21]:
df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])
df.select(regexp_substr('str', lit(r'\d+')).alias('d')).collect()

[Row(d='1')]

In [22]:
df.select(regexp_substr('str', lit(r'mmm')).alias('d')).collect()

[Row(d=None)]

In [23]:
df.select(regexp_substr("str", col("regexp")).alias('d')).collect()

[Row(d='1')]

#### pyspark.sql.functions.regexp_instr(str: ColumnOrName, regexp: ColumnOrName, idx: Union[int, pyspark.sql.column.Column, None] = None) → pyspark.sql.column.Column
Extract all strings in the str that match the Java regex regexp and corresponding to the regex group index.

In [24]:
df = spark.createDataFrame([("1a 2b 14m", r"\d+(a|b|m)")], ["str", "regexp"])
df.select(regexp_instr('str', lit(r'\d+(a|b|m)')).alias('d')).collect()

[Row(d=1)]

In [25]:
df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 1).alias('d')).collect()

[Row(d=1)]

In [26]:
df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 2).alias('d')).collect()

[Row(d=1)]

In [27]:
df.select(regexp_instr('str', col("regexp")).alias('d')).collect()

[Row(d=1)]

#### pyspark.sql.functions.replace(src: ColumnOrName, search: ColumnOrName, replace: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
Replaces all occurrences of search with replace.

In [28]:
df = spark.createDataFrame([("ABCabc", "abc", "DEF",)], ["a", "b", "c"])
df.select(replace(df.a, df.b, df.c).alias('r')).collect()

[Row(r='ABCDEF')]

In [29]:
df.select(replace(df.a, df.b).alias('r')).collect()

[Row(r='ABC')]

#### pyspark.sql.functions.right(str: ColumnOrName, len: ColumnOrName) → pyspark.sql.column.Column¶
    Returns the rightmost len`(`len can be string type) characters from the string str, if len is less or equal than 0 the result is an empty string.

In [30]:
df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b'])
df.select(right(df.a, df.b).alias('r')).collect()

[Row(r='SQL')]

#### pyspark.sql.functions.ucase(str: ColumnOrName) → pyspark.sql.column.Column
Returns str with all characters changed to uppercase.

In [32]:
spark.range(1).select(ucase(lit("Spark"))).show()

+------------+
|ucase(Spark)|
+------------+
|       SPARK|
+------------+



#### pyspark.sql.functions.unbase64(col: ColumnOrName) → pyspark.sql.column.Column
Decodes a BASE64 encoded string column and returns it as a binary column.

In [33]:
df = spark.createDataFrame(["U3Bhcms=",
                            "UHlTcGFyaw==",
                            "UGFuZGFzIEFQSQ=="], "STRING")
df.select(unbase64("value")).show()

+--------------------+
|     unbase64(value)|
+--------------------+
|    [53 70 61 72 6B]|
|[50 79 53 70 61 7...|
|[50 61 6E 64 61 7...|
+--------------------+



#### pyspark.sql.functions.rpad(col: ColumnOrName, len: int, pad: str) → pyspark.sql.column.Column
Right-pad the string column to width len with pad.

In [34]:
df = spark.createDataFrame([('abcd',)], ['s',])
df.select(rpad(df.s, 6, '#').alias('s')).collect()

[Row(s='abcd##')]

#### pyspark.sql.functions.repeat(col: ColumnOrName, n: int) → pyspark.sql.column.Column
Repeats a string column n times, and returns it as a new string column.

In [35]:
df = spark.createDataFrame([('ab',)], ['s',])
df.select(repeat(df.s, 3).alias('s')).collect()

[Row(s='ababab')]

#### pyspark.sql.functions.rtrim(col: ColumnOrName) → pyspark.sql.column.Column
Trim the spaces from right end for the specified string value.

In [36]:
df = spark.createDataFrame(["   Spark", "Spark  ", " Spark"], "STRING")
df.select(rtrim("value").alias("r")).withColumn("length", length("r")).show()

+--------+------+
|       r|length|
+--------+------+
|   Spark|     8|
|   Spark|     5|
|   Spark|     6|
+--------+------+



#### pyspark.sql.functions.soundex(col: ColumnOrName) → pyspark.sql.column.Column
Returns the SoundEx encoding for a string

In [37]:
df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
df.select(soundex(df.name).alias("soundex")).collect()

[Row(soundex='P362'), Row(soundex='U612')]

#### pyspark.sql.functions.split(str: ColumnOrName, pattern: str, limit: int = - 1) → pyspark.sql.column.Column
Splits str around matches of the given pattern.

Parameters:
- str: Column or str | 
a string expression to split
- pattern: str | 
a string representing a regular expression. The regex string should be a Java regular expression.
- limit: int, optional | 
an integer which controls the number of times pattern is applied.
    * limit > 0: The resulting array’s length will not be more than limit, and the
resulting array’s last entry will contain all input beyond the last matched pattern.
    * limit <= 0: pattern will be applied as many times as possible, and the resulting
array can be of any size.

Returns – Column | 
array of separated strings.

In [38]:
df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
df.select(split(df.s, '[ABC]', 2).alias('s')).collect()

[Row(s=['one', 'twoBthreeC'])]

In [39]:
df.select(split(df.s, '[ABC]', -1).alias('s')).collect()

[Row(s=['one', 'two', 'three', ''])]

#### pyspark.sql.functions.split_part(src: ColumnOrName, delimiter: ColumnOrName, partNum: ColumnOrName) → pyspark.sql.column.Column
Splits str by delimiter and return requested part of the split (1-based). If any input is null, returns null. if partNum is out of range of split parts, returns empty string. If partNum is 0, throws an error. If partNum is negative, the parts are counted backward from the end of the string. If the delimiter is an empty string, the str is not split.

In [40]:
df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"])
df.select(split_part(df.a, df.b, df.c).alias('r')).collect()

[Row(r='13')]

#### pyspark.sql.functions.startswith(str: ColumnOrName, prefix: ColumnOrName) → pyspark.sql.column.Column
Returns a boolean. The value is True if str starts with prefix. Returns NULL if either input expression is NULL. Otherwise, returns False. Both str or prefix must be of STRING or BINARY type.

In [41]:
df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"])
df.select(startswith(df.a, df.b).alias('r')).collect()

[Row(r=True)]

In [42]:
df = spark.createDataFrame([("414243", "4142",)], ["e", "f"])
df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f"))
df.printSchema()

root
 |-- e: binary (nullable = true)
 |-- f: binary (nullable = true)



In [43]:
df.select(startswith("e", "f"), startswith("f", "e")).show()

+----------------+----------------+
|startswith(e, f)|startswith(f, e)|
+----------------+----------------+
|            true|           false|
+----------------+----------------+



#### pyspark.sql.functions.substr(str: ColumnOrName, pos: ColumnOrName, len: Optional[ColumnOrName] = None) → pyspark.sql.column.Column¶
Returns the substring of str that starts at pos and is of length len, or the slice of byte array that starts at pos and is of length len.

In [44]:
spark.createDataFrame(
    [("Spark SQL", 5, 1,)], ["a", "b", "c"]
).select(substr("a", "b", "c")).show()

+---------------+
|substr(a, b, c)|
+---------------+
|              k|
+---------------+



In [45]:
spark.createDataFrame(
    [("Spark SQL", 5, 1,)], ["a", "b", "c"]
).select(substr("a", "b")).show()

+------------------------+
|substr(a, b, 2147483647)|
+------------------------+
|                   k SQL|
+------------------------+



#### pyspark.sql.functions.substring(str: ColumnOrName, pos: int, len: int) → pyspark.sql.column.Column
Substring starts at pos and is of length len when str is String type or returns the slice of byte array that starts at pos in byte and is of length len when str is Binary type.

##### Notes

The position is not zero based, but 1 based index.

In [46]:
df = spark.createDataFrame([('abcd',)], ['s',])
df.select(substring(df.s, 1, 2).alias('s')).collect()

[Row(s='ab')]

#### pyspark.sql.functions.substring_index(str: ColumnOrName, delim: str, count: int) → pyspark.sql.column.Column
Returns the substring from string str before count occurrences of the delimiter delim. If count is positive, everything the left of the final delimiter (counting from left) is returned. If count is negative, every to the right of the final delimiter (counting from the right) is returned. substring_index performs a case-sensitive match when searching for delim.

In [47]:
df = spark.createDataFrame([('a.b.c.d',)], ['s'])
df.select(substring_index(df.s, '.', 2).alias('s')).collect()

[Row(s='a.b')]

In [48]:
df.select(substring_index(df.s, '.', -3).alias('s')).collect()

[Row(s='b.c.d')]

#### pyspark.sql.functions.overlay(src: ColumnOrName, replace: ColumnOrName, pos: Union[ColumnOrName, int], len: Union[ColumnOrName, int] = - 1) → pyspark.sql.column.Column
Overlay the specified portion of src with replace, starting from byte position pos of src and proceeding for len bytes.

Parameters:
- src: Column or str | 
column name or column containing the string that will be replaced
- replace: Column or str | 
column name or column containing the substitution string
- pos: Column or str or int | 
column name, column, or int containing the starting position in src
- len: Column or str or int, optional | 
column name, column, or int containing the number of bytes to replace in src string by ‘replace’ defaults to -1, which represents the length of the ‘replace’ string

Returns – Column | 
string with replaced values.

In [49]:
df = spark.createDataFrame([("SPARK_SQL", "CORE")], ("x", "y"))
df.select(overlay("x", "y", 7).alias("overlayed")).collect()

[Row(overlayed='SPARK_CORE')]

In [50]:
df.select(overlay("x", "y", 7, 0).alias("overlayed")).collect()

[Row(overlayed='SPARK_CORESQL')]

In [51]:
df.select(overlay("x", "y", 7, 2).alias("overlayed")).collect()

[Row(overlayed='SPARK_COREL')]

#### pyspark.sql.functions.sentences(string: ColumnOrName, language: Optional[ColumnOrName] = None, country: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
Splits a string into arrays of sentences, where each sentence is an array of words. The ‘language’ and ‘country’ arguments are optional, and if omitted, the default locale is used.

In [53]:
df = spark.createDataFrame([["This is an example sentence."]], ["string"])
df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False)

+-----------------------------------+
|sentences(string, en, US)          |
+-----------------------------------+
|[[This, is, an, example, sentence]]|
+-----------------------------------+



In [54]:
df = spark.createDataFrame([["Hello world. How are you?"]], ["s"])
df.select(sentences("s")).show(truncate=False)

+---------------------------------+
|sentences(s, , )                 |
+---------------------------------+
|[[Hello, world], [How, are, you]]|
+---------------------------------+



#### pyspark.sql.functions.to_binary(col: ColumnOrName, format: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
Converts the input col to a binary value based on the supplied format. The format can be a case-insensitive string literal of “hex”, “utf-8”, “utf8”, or “base64”. By default, the binary format for conversion is “hex” if format is omitted. The function returns NULL if at least one of the input parameters is NULL.

In [55]:
df = spark.createDataFrame([("abc",)], ["e"])
df.select(to_binary(df.e, lit("utf-8")).alias('r')).collect()

[Row(r=bytearray(b'abc'))]

In [56]:
df = spark.createDataFrame([("414243",)], ["e"])
df.select(to_binary(df.e).alias('r')).collect()

[Row(r=bytearray(b'ABC'))]

#### pyspark.sql.functions.to_char(col: ColumnOrName, format: ColumnOrName) → pyspark.sql.column.Column¶
Convert col to a string based on the format. 

Throws an exception if the conversion fails. The format can consist of the following characters, case insensitive: ‘0’ or ‘9’: Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the format string matches a sequence of digits in the input value, generating a result string of the same length as the corresponding sequence in the format string. The result string is left-padded with zeros if the 0/9 sequence comprises more digits than the matching part of the decimal value, starts with 0, and is before the decimal point. Otherwise, it is padded with spaces. ‘.’ or ‘D’: Specifies the position of the decimal point (optional, only allowed once). ‘,’ or ‘G’: Specifies the position of the grouping (thousands) separator (,). There must be a 0 or 9 to the left and right of each grouping separator. ‘′:𝑆𝑝𝑒𝑐𝑖𝑓𝑖𝑒𝑠𝑡ℎ𝑒𝑙𝑜𝑐𝑎𝑡𝑖𝑜𝑛𝑜𝑓𝑡ℎ𝑒
 currency sign. This character may only be specified once. ‘S’ or ‘MI’: Specifies the position of a ‘-‘ or ‘+’ sign (optional, only allowed once at the beginning or end of the format string). Note that ‘S’ prints ‘+’ for positive values but ‘MI’ prints a space. ‘PR’: Only allowed at the end of the format string; specifies that the result string will be wrapped by angle brackets if the input value is negative.

In [57]:
df = spark.createDataFrame([(78.12,)], ["e"])
df.select(to_char(df.e, lit("$99.99")).alias('r')).collect()

[Row(r='$78.12')]

#### pyspark.sql.functions.to_number(col: ColumnOrName, format: ColumnOrName) → pyspark.sql.column.Column¶
Convert string ‘col’ to a number based on the string format ‘format’.

Throws an exception if the conversion fails. The format can consist of the following characters, case insensitive: ‘0’ or ‘9’: Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the format string matches a sequence of digits in the input string. If the 0/9 sequence starts with 0 and is before the decimal point, it can only match a digit sequence of the same size. Otherwise, if the sequence starts with 9 or is after the decimal point, it can match a digit sequence that has the same or smaller size. ‘.’ or ‘D’: Specifies the position of the decimal point (optional, only allowed once). ‘,’ or ‘G’: Specifies the position of the grouping (thousands) separator (,). There must be a 0 or 9 to the left and right of each grouping separator. ‘col’ must match the grouping separator relevant for the size of the number. ‘′:𝑆𝑝𝑒𝑐𝑖𝑓𝑖𝑒𝑠𝑡ℎ𝑒𝑙𝑜𝑐𝑎𝑡𝑖𝑜𝑛𝑜𝑓𝑡ℎ𝑒
 currency sign. This character may only be specified once. ‘S’ or ‘MI’: Specifies the position of a ‘-‘ or ‘+’ sign (optional, only allowed once at the beginning or end of the format string). Note that ‘S’ allows ‘-‘ but ‘MI’ does not. ‘PR’: Only allowed at the end of the format string; specifies that ‘col’ indicates a negative number with wrapping angled brackets.



In [58]:
df = spark.createDataFrame([("$78.12",)], ["e"])
df.select(to_number(df.e, lit("$99.99")).alias('r')).collect()

[Row(r=Decimal('78.12'))]

#### pyspark.sql.functions.to_varchar(col: ColumnOrName, format: ColumnOrName) → pyspark.sql.column.Column
Convert col to a string based on the format.

Throws an exception if the conversion fails. The format can consist of the following characters, case insensitive: ‘0’ or ‘9’: Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the format string matches a sequence of digits in the input value, generating a result string of the same length as the corresponding sequence in the format string. The result string is left-padded with zeros if the 0/9 sequence comprises more digits than the matching part of the decimal value, starts with 0, and is before the decimal point. Otherwise, it is padded with spaces. ‘.’ or ‘D’: Specifies the position of the decimal point (optional, only allowed once). ‘,’ or ‘G’: Specifies the position of the grouping (thousands) separator (,). There must be a 0 or 9 to the left and right of each grouping separator. ‘′:𝑆𝑝𝑒𝑐𝑖𝑓𝑖𝑒𝑠𝑡ℎ𝑒𝑙𝑜𝑐𝑎𝑡𝑖𝑜𝑛𝑜𝑓𝑡ℎ𝑒
 currency sign. This character may only be specified once. ‘S’ or ‘MI’: Specifies the position of a ‘-‘ or ‘+’ sign (optional, only allowed once at the beginning or end of the format string). Note that ‘S’ prints ‘+’ for positive values but ‘MI’ prints a space. ‘PR’: Only allowed at the end of the format string; specifies that the result string will be wrapped by angle brackets if the input value is negative.

In [59]:
df = spark.createDataFrame([(78.12,)], ["e"])
df.select(to_varchar(df.e, lit("$99.99")).alias('r')).collect()

[Row(r='$78.12')]

#### pyspark.sql.functions.translate(srcCol: ColumnOrName, matching: str, replace: str) → pyspark.sql.column.Column
A function translate any character in the srcCol by a character in matching. The characters in replace is corresponding to the characters in matching. Translation will happen whenever any character in the string is matching with the character in the matching.

In [60]:
spark.createDataFrame([('translate',)], ['a']) \
     .select(translate('a', "rnlt", "123") \
     .alias('r')).collect()

[Row(r='1a2s3ae')]

#### pyspark.sql.functions.trim(col: ColumnOrName) → pyspark.sql.column.Column
Trim the spaces from both ends for the specified string column.

In [61]:
df = spark.createDataFrame(["   Spark", "Spark  ", " Spark"], "STRING")
df.select(trim("value").alias("r")).withColumn("length", length("r")).show()

+-----+------+
|    r|length|
+-----+------+
|Spark|     5|
|Spark|     5|
|Spark|     5|
+-----+------+



#### pyspark.sql.functions.upper(col: ColumnOrName) → pyspark.sql.column.Column
Converts a string expression to upper case.

In [62]:
df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")
df.select(upper("value")).show()

+------------+
|upper(value)|
+------------+
|       SPARK|
|     PYSPARK|
|  PANDAS API|
+------------+



In [64]:
tour.limit(10).select("Player name", upper("Player name")).show()

+---------------+------------------+
|    Player name|upper(Player name)|
+---------------+------------------+
|Robert Garrigus|   ROBERT GARRIGUS|
|   Bubba Watson|      BUBBA WATSON|
| Dustin Johnson|    DUSTIN JOHNSON|
|Brett Wetterich|   BRETT WETTERICH|
|    J.B. Holmes|       J.B. HOLMES|
|      John Daly|         JOHN DALY|
|  Graham DeLaet|     GRAHAM DELAET|
|  Angel Cabrera|     ANGEL CABRERA|
| Charles Warren|    CHARLES WARREN|
|    D.J. Trahan|       D.J. TRAHAN|
+---------------+------------------+



#### pyspark.sql.functions.url_decode(str: ColumnOrName) → pyspark.sql.column.Column
Decodes a str in ‘application/x-www-form-urlencoded’ format using a specific encoding scheme.

In [65]:
df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)], ["a"])
df.select(url_decode(df.a).alias('r')).collect()

[Row(r='https://spark.apache.org')]

#### pyspark.sql.functions.url_encode(str: ColumnOrName) → pyspark.sql.column.Column
Translates a string into ‘application/x-www-form-urlencoded’ format using a specific encoding scheme.

In [66]:
df = spark.createDataFrame([("https://spark.apache.org",)], ["a"])
df.select(url_encode(df.a).alias('r')).collect()

[Row(r='https%3A%2F%2Fspark.apache.org')]

## [Bitwise Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#bitwise-functions)

#### pyspark.sql.functions.bit_count(col: ColumnOrName) → pyspark.sql.column.Column
Returns the number of bits that are set in the argument expr as an unsigned 64-bit integer, or NULL if the argument is NULL.

In [177]:
df = spark.createDataFrame([[1],[1],[2]], ["c"])
df.select(bit_count("c")).show()
# Returns Column | the number of bits that are set in the argument expr as
# an unsigned 64-bit integer, or NULL if the argument is NULL.

+------------+
|bit_count(c)|
+------------+
|           1|
|           1|
|           1|
+------------+



#### pyspark.sql.functions.bit_get(col: ColumnOrName, pos: ColumnOrName) → pyspark.sql.column.Column
Returns the value of the bit (0 or 1) at the specified position. The positions are numbered from right to left, starting at zero. The position argument cannot be negative.

In [180]:
df = spark.createDataFrame([[1],[1],[2],[3],[4],[5],[6],], ["c"])
df.select(bit_get("c", lit(2)).alias("2"), bit_get("c", lit(1)).alias("1"), bit_get("c", lit(0)).alias("0")).show()

+---+---+---+
|  2|  1|  0|
+---+---+---+
|  0|  0|  1|
|  0|  0|  1|
|  0|  1|  0|
|  0|  1|  1|
|  1|  0|  0|
|  1|  0|  1|
|  1|  1|  0|
+---+---+---+



#### pyspark.sql.functions.getbit(col: ColumnOrName, pos: ColumnOrName) → pyspark.sql.column.Column
Returns the value of the bit (0 or 1) at the specified position. The positions are numbered from right to left, starting at zero. The position argument cannot be negative.

In [181]:
import pyspark.sql.functions as sf
spark.createDataFrame(
    [[1], [1], [2]], ["c"]
).select(sf.getbit("c", sf.lit(1))).show()

+------------+
|getbit(c, 1)|
+------------+
|           0|
|           0|
|           1|
+------------+



## [Call Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#call-functions)

#### pyspark.sql.functions.call_function(funcName: str, *cols: ColumnOrName) → pyspark.sql.column.Column
Call a SQL function.

`print(call_function.__doc__)`

    Call a SQL function.

    .. versionadded:: 3.5.0

    Parameters
    ----------
    funcName : str
        function name that follows the SQL identifier syntax (can be quoted, can be qualified)
    cols : :class:`~pyspark.sql.Column` or str
        column names or :class:`~pyspark.sql.Column`\s to be used in the function

    Returns
    -------
    :class:`~pyspark.sql.Column`
        result of executed function.

    Examples
    --------
    >>> from pyspark.sql.functions import call_udf, col
    >>> from pyspark.sql.types import IntegerType, StringType
    >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])
    >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())
    >>> df.select(call_function("intX2", "id")).show()
    +---------+
    |intX2(id)|
    +---------+
    |        2|
    |        4|
    |        6|
    +---------+
    >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType())
    >>> df.select(call_function("strX2", col("name"))).show()
    +-----------+
    |strX2(name)|
    +-----------+
    |         aa|
    |         bb|
    |         cc|
    +-----------+
    >>> df.select(call_function("avg", col("id"))).show()
    +-------+
    |avg(id)|
    +-------+
    |    2.0|
    +-------+
    >>> _ = spark.sql("CREATE FUNCTION custom_avg AS 'test.org.apache.spark.sql.MyDoubleAvg'")
    ... # doctest: +SKIP
    >>> df.select(call_function("custom_avg", col("id"))).show()
    ... # doctest: +SKIP
    +------------------------------------+
    |spark_catalog.default.custom_avg(id)|
    +------------------------------------+
    |                               102.0|
    +------------------------------------+
    >>> df.select(call_function("spark_catalog.default.custom_avg", col("id"))).show()
    ... # doctest: +SKIP
    +------------------------------------+
    |spark_catalog.default.custom_avg(id)|
    +------------------------------------+
    |                               102.0|
    +------------------------------------+
    

In [3]:
df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])
_ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())
df.select(call_function("intX2", "id")).show()

                                                                                

+---------+
|intX2(id)|
+---------+
|        2|
|        4|
|        6|
+---------+



In [4]:
_ = spark.udf.register("strX2", lambda s: s * 2, StringType())
df.select(call_function("strX2", col("name"))).show()

+-----------+
|strX2(name)|
+-----------+
|         aa|
|         bb|
|         cc|
+-----------+



In [5]:
df.select(call_function("avg", col("id"))).show()

+-------+
|avg(id)|
+-------+
|    2.0|
+-------+



In [6]:
# _ = spark.sql("CREATE FUNCTION custom_avg AS 'test.org.apache.spark.sql.MyDoubleAvg'")
# df.select(call_function("custom_avg", col("id"))).show()
# df.select(call_function("spark_catalog.default.custom_avg", col("id"))).show()

#### pyspark.sql.functions.call_udf(udfName: str, *cols: ColumnOrName) → pyspark.sql.column.Column
Call an user-defined function.

`print(call_udf.__doc__)`

    Call an user-defined function.

    .. versionadded:: 3.4.0

    Parameters
    ----------
    udfName : str
        name of the user defined function (UDF)
    cols : :class:`~pyspark.sql.Column` or str
        column names or :class:`~pyspark.sql.Column`\s to be used in the UDF

    Returns
    -------
    :class:`~pyspark.sql.Column`
        result of executed udf.

    Examples
    --------
    >>> from pyspark.sql.functions import call_udf, col
    >>> from pyspark.sql.types import IntegerType, StringType
    >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])
    >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())
    >>> df.select(call_udf("intX2", "id")).show()
    +---------+
    |intX2(id)|
    +---------+
    |        2|
    |        4|
    |        6|
    +---------+
    >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType())
    >>> df.select(call_udf("strX2", col("name"))).show()
    +-----------+
    |strX2(name)|
    +-----------+
    |         aa|
    |         bb|
    |         cc|
    +-----------+

In [95]:
_ = spark.udf.register("intX1.5", lambda i: i * 1.5, IntegerType())
students.limit(10).select(call_udf("intX2", "math score")).show()

+-----------------+
|intX2(math score)|
+-----------------+
|              144|
|              138|
|              180|
|               94|
|              152|
|              142|
|              176|
|               80|
|              128|
|               76|
+-----------------+



In [101]:
_ = spark.udf.register("strX2", lambda s: s * 2, StringType())
tour.limit(5).select(call_udf("strX2", col("Player name"))).show(truncate=False)

23/10/22 16:03:15 WARN SimpleFunctionRegistry: The function strx2 replaced a previously registered function.


+------------------------------+
|strX2(Player name)            |
+------------------------------+
|Robert GarrigusRobert Garrigus|
|Bubba WatsonBubba Watson      |
|Dustin JohnsonDustin Johnson  |
|Brett WetterichBrett Wetterich|
|J.B. HolmesJ.B. Holmes        |
+------------------------------+



#### pyspark.sql.functions.pandas_udf(f=None, returnType=None, functionType=None)
Creates a pandas user defined function (a.k.a. vectorized user defined function).

Pandas UDFs are user defined functions that are executed by Spark using Arrow to transfer data and Pandas to work with the data, which allows vectorized operations. A Pandas UDF is defined using the pandas_udf as a decorator or to wrap the function, and no additional configuration is required. A Pandas UDF behaves as a regular PySpark function API in general.

Parameters:
- f: function, optional |
user-defined function. A python function if used as a standalone function
- return: Typepyspark.sql.types.DataType or str, optional | 
the return type of the user-defined function. The value can be either a pyspark.sql.types.DataType object or a DDL-formatted type string.
- function: Typeint, optional | 
an enum value in pyspark.sql.functions.PandasUDFType. Default: SCALAR. This parameter exists for compatibility. Using Python type hints is encouraged.

[why] Get same errors with all code snippets. Probably unmatching versions of libraries

In [15]:
import pandas as pd

@pandas_udf(returnType=IntegerType())
def slen(s: pd.Series) -> pd.Series:
    return s.str.len()

# _ = spark.udf.register("slen", slen, IntegerType())

df = students.select("gender")
df = df.withColumn("Name_Length", slen(df["gender"])) # .show()
df.show()

23/10/25 12:59:32 ERROR ArrowPythonRunner: Python worker exited unexpectedly (crashed)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/zsavchenko/.local/share/virtualenvs/spark_env-J_TJEM2Z/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1225, in main
    eval_type = read_int(infile)
                ^^^^^^^^^^^^^^^^
  File "/Users/zsavchenko/.local/share/virtualenvs/spark_env-J_TJEM2Z/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 596, in read_int
    raise EOFError
EOFError

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:118)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	

Py4JJavaError: An error occurred while calling o267.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 17.0 failed 1 times, most recent failure: Lost task 0.0 in stage 17.0 (TID 37) (192.168.0.179 executor driver): java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	at org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	at org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	at org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	at org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	at org.apache.arrow.vector.ipc.ArrowWriter.writeRecordBatch(ArrowWriter.java:147)
	at org.apache.arrow.vector.ipc.ArrowWriter.writeBatch(ArrowWriter.java:133)
	at org.apache.spark.sql.execution.python.BasicPythonArrowInput.writeIteratorToArrowStream(PythonArrowInput.scala:140)
	at org.apache.spark.sql.execution.python.BasicPythonArrowInput.writeIteratorToArrowStream$(PythonArrowInput.scala:124)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner.writeIteratorToArrowStream(ArrowPythonRunner.scala:30)
	at org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.$anonfun$writeIteratorToStream$1(PythonArrowInput.scala:96)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.writeIteratorToStream(PythonArrowInput.scala:102)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4344)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4334)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4332)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3549)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
	at org.apache.arrow.memory.util.MemoryUtil.directBuffer(MemoryUtil.java:174)
	at org.apache.arrow.memory.ArrowBuf.getDirectBuffer(ArrowBuf.java:229)
	at org.apache.arrow.memory.ArrowBuf.nioBuffer(ArrowBuf.java:224)
	at org.apache.arrow.vector.ipc.WriteChannel.write(WriteChannel.java:133)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.writeBatchBuffers(MessageSerializer.java:303)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(MessageSerializer.java:276)
	at org.apache.arrow.vector.ipc.ArrowWriter.writeRecordBatch(ArrowWriter.java:147)
	at org.apache.arrow.vector.ipc.ArrowWriter.writeBatch(ArrowWriter.java:133)
	at org.apache.spark.sql.execution.python.BasicPythonArrowInput.writeIteratorToArrowStream(PythonArrowInput.scala:140)
	at org.apache.spark.sql.execution.python.BasicPythonArrowInput.writeIteratorToArrowStream$(PythonArrowInput.scala:124)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner.writeIteratorToArrowStream(ArrowPythonRunner.scala:30)
	at org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.$anonfun$writeIteratorToStream$1(PythonArrowInput.scala:96)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.writeIteratorToStream(PythonArrowInput.scala:102)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)


In [126]:
from pyspark.sql.functions import PandasUDFType
from pyspark.sql.types import IntegerType
@pandas_udf(IntegerType(), PandasUDFType.SCALAR)
def slen(s):
    return s.str.len()

In [127]:
@pandas_udf("col1 string, col2 long")
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    s3['col2'] = s1 + s2.str.len()
    return s3

# Create a Spark DataFrame that has three columns including a struct column.
df = spark.createDataFrame(
    [[1, "a string", ("a nested string",)]],
    "long_col long, string_col string, struct_col struct<col1:string>")
df.printSchema()

root
 |-- long_col: long (nullable = true)
 |-- string_col: string (nullable = true)
 |-- struct_col: struct (nullable = true)
 |    |-- col1: string (nullable = true)



In [128]:
df.select(func("long_col", "string_col", "struct_col")).printSchema()

root
 |-- func(long_col, string_col, struct_col): struct (nullable = true)
 |    |-- col1: string (nullable = true)
 |    |-- col2: long (nullable = true)



In [130]:
"""
@pandas_udf("string")
def to_upper(s: pd.Series) -> pd.Series:
    return s.str.upper()

df = spark.createDataFrame([("John Doe",)], ("name",))
df.select(to_upper("name")).show()
"""

'\n@pandas_udf("string")\ndef to_upper(s: pd.Series) -> pd.Series:\n    return s.str.upper()\n\ndf = spark.createDataFrame([("John Doe",)], ("name",))\ndf.select(to_upper("name")).show()\n'

In [132]:
"""
@pandas_udf("first string, last string")
def split_expand(s: pd.Series) -> pd.DataFrame:
    return s.str.split(expand=True)

df = spark.createDataFrame([("John Doe",)], ("name",))
df.select(split_expand("name")).show()
"""

'\n@pandas_udf("first string, last string")\ndef split_expand(s: pd.Series) -> pd.DataFrame:\n    return s.str.split(expand=True)\n\ndf = spark.createDataFrame([("John Doe",)], ("name",))\ndf.select(split_expand("name")).show()\n'

In [136]:
"""
from typing import Iterator

@pandas_udf("long")
def calculate(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    # Do some expensive initialization with a state
    state = very_expensive_initialization()
    for x in iterator:
        # Use that state for whole iterator.
        yield calculate_with_state(x, state)

df.select(calculate("value")).show()
"""

'\nfrom typing import Iterator\n\n@pandas_udf("long")\ndef calculate(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:\n    # Do some expensive initialization with a state\n    state = very_expensive_initialization()\n    for x in iterator:\n        # Use that state for whole iterator.\n        yield calculate_with_state(x, state)\n\ndf.select(calculate("value")).show()\n'

In [138]:
"""
from typing import Iterator
@pandas_udf("long")
def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    for s in iterator:
        yield s + 1

df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"]))
df.select(plus_one(df.v)).show()
"""

'\nfrom typing import Iterator\n@pandas_udf("long")\ndef plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:\n    for s in iterator:\n        yield s + 1\n\ndf = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"]))\ndf.select(plus_one(df.v)).show()\n'

In [140]:
"""
from typing import Iterator, Tuple
from pyspark.sql.functions import struct, col
@pandas_udf("long")
def multiply(iterator: Iterator[Tuple[pd.Series, pd.DataFrame]]) -> Iterator[pd.Series]:
    for s1, df in iterator:
        yield s1 * df.v

df = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"]))
df.withColumn('output', multiply(col("v"), struct(col("v")))).show()
"""

'\nfrom typing import Iterator, Tuple\nfrom pyspark.sql.functions import struct, col\n@pandas_udf("long")\ndef multiply(iterator: Iterator[Tuple[pd.Series, pd.DataFrame]]) -> Iterator[pd.Series]:\n    for s1, df in iterator:\n        yield s1 * df.v\n\ndf = spark.createDataFrame(pd.DataFrame([1, 2, 3], columns=["v"]))\ndf.withColumn(\'output\', multiply(col("v"), struct(col("v")))).show()\n'

In [142]:
"""
@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    return v.mean()

df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))
df.groupby("id").agg(mean_udf(df['v'])).show()
"""

'\n@pandas_udf("double")\ndef mean_udf(v: pd.Series) -> float:\n    return v.mean()\n\ndf = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))\ndf.groupby("id").agg(mean_udf(df[\'v\'])).show()\n'

In [144]:
"""
from pyspark.sql import Window
@pandas_udf("double")
def mean_udf(v: pd.Series) -> float:
    return v.mean()

df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))
w = Window.partitionBy('id').orderBy('v').rowsBetween(-1, 0)
df.withColumn('mean_v', mean_udf("v").over(w)).show()
"""

'\nfrom pyspark.sql import Window\n@pandas_udf("double")\ndef mean_udf(v: pd.Series) -> float:\n    return v.mean()\n\ndf = spark.createDataFrame(\n    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))\nw = Window.partitionBy(\'id\').orderBy(\'v\').rowsBetween(-1, 0)\ndf.withColumn(\'mean_v\', mean_udf("v").over(w)).show()\n'

In [146]:
print(pandas_udf.__doc__)


    Creates a pandas user defined function (a.k.a. vectorized user defined function).

    Pandas UDFs are user defined functions that are executed by Spark using Arrow to transfer
    data and Pandas to work with the data, which allows vectorized operations. A Pandas UDF
    is defined using the `pandas_udf` as a decorator or to wrap the function, and no
    additional configuration is required. A Pandas UDF behaves as a regular PySpark function
    API in general.

    .. versionadded:: 2.3.0

    .. versionchanged:: 3.4.0
        Supports Spark Connect.

    Parameters
    ----------
    f : function, optional
        user-defined function. A python function if used as a standalone function
    returnType : :class:`pyspark.sql.types.DataType` or str, optional
        the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
    functionType : int, optional
        an enum value in :clas

#### pyspark.sql.functions.udf(f: Union[Callable[[…], Any], DataTypeOrString, None] = None, returnType: DataTypeOrString = StringType(), *, useArrow: Optional[bool] = None) → Union[UserDefinedFunctionLike, Callable[[Callable[[…], Any]], UserDefinedFunctionLike]]
Creates a user defined function (UDF).

Parameters: 
- f: function |
python function if used as a standalone function
- return: Typepyspark.sql.types.DataType or str | 
the return type of the user-defined function. The value can be either a pyspark.sql.types.DataType object or a DDL-formatted type string.
- use: Arrowbool or None | 
whether to use Arrow to optimize the (de)serialization. When it is None, the Spark config “spark.sql.execution.pythonUDF.arrow.enabled” takes effect.

In [147]:
import random
random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()

In [149]:
students.limit(5).withColumn("Random int", random_udf()).show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+----------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|Random int|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+----------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|         2|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|        69|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|        38|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|         3|
|  male|       group C|    

In [148]:
from pyspark.sql.types import IntegerType
slen = udf(lambda s: len(s), IntegerType())
@udf
def to_upper(s):
    if s is not None:
        return s.upper()

@udf(returnType=IntegerType())
def add_one(x):
    if x is not None:
        return x + 1

df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show()

+----------+--------------+------------+
|slen(name)|to_upper(name)|add_one(age)|
+----------+--------------+------------+
|         8|      JOHN DOE|          22|
+----------+--------------+------------+



#### pyspark.sql.functions.udtf(cls: Optional[Type] = None, *, returnType: Union[pyspark.sql.types.StructType, str], useArrow: Optional[bool] = None) → Union[pyspark.sql.udtf.UserDefinedTableFunction, Callable[[Type], pyspark.sql.udtf.UserDefinedTableFunction]]
Creates a user defined table function (UDTF).

Parameters:
- cls: class | 
the Python user-defined table function handler class.
- return: Typepyspark.sql.types.StructType or str | 
the return type of the user-defined table function. The value can be either a pyspark.sql.types.StructType object or a DDL-formatted struct type string.
- use: Arrowbool or None, optional | 
whether to use Arrow to optimize the (de)serializations. When it’s set to None, the Spark config “spark.sql.execution.pythonUDTF.arrow.enabled” is used.

In [166]:
class PlusOne:
    def eval(self, a: int):
        yield a + 1,
plus_one = udtf(PlusOne, returnType="r: int").asDeterministic()

In [167]:
class TestUDTF:
    def eval(self, *args: Any):
        yield "hello", "world"

test_udtf = udtf(TestUDTF, returnType="c1: string, c2: string")
test_udtf().show()

+-----+-----+
|   c1|   c2|
+-----+-----+
|hello|world|
+-----+-----+



In [168]:
@udtf(returnType="c1: int, c2: int")
class PlusOne:
    def eval(self, x: int):
        yield x, x + 1

from pyspark.sql.functions import lit
PlusOne(lit(1)).show()

+---+---+
| c1| c2|
+---+---+
|  1|  2|
+---+---+



In [173]:
@udtf(returnType="c1: int, c2: int", useArrow=True)
class ArrowPlusOne:
    def eval(self, x: int):
        yield x, x + 1

# eval_type = read_int(infile) Error
# ArrowPlusOne(lit(1)).show()

#### pyspark.sql.functions.unwrap_udt(col: ColumnOrName) → pyspark.sql.column.Column
Unwrap UDT data type column into its underlying type.

In [165]:
"""
class MyUDT:
    def __init__(self, value):
        self.value = value

data = [(MyUDT(42),)]
schema = StructType([StructField("my_udt", MyUDTType(), True)])
df = spark.createDataFrame(data, schema)
"""
True

True

In [None]:
"""
result_df = df.select(unwrap_udt(col("my_udt")).alias("my_udt_value"))
result_df.show()
"""

In [164]:
"""
from pyspark.sql.functions import col
from pyspark.sql.types import UserDefinedType

# Визначаємо користувацький тип даних MyUDT
class MyUDT(UserDefinedType):

    def __init__(self, value = None):
        self.value = value
        
    def simpleString(self):
        return "MyUDT"

    def serialize(self, obj):
        return str(obj).encode('utf-8')

    def deserialize(self, datum):
        return MyUDT(int(datum.decode('utf-8')))

# Оголошуємо функцію unwrap_udt з використанням користувацького типу MyUDT
@pandas_udf(MyUDT())
def unwrap_udt(s: pd.Series) -> pd.Series:
    return s

# Створюємо DataFrame
data = [(MyUDT(42),), (MyUDT(56),)]
df = spark.createDataFrame(data, ["my_udt"])

# Використовуємо unwrap_udt для розгортання користувацького типу MyUDT
result_df = df.select(unwrap_udt(col("my_udt")).alias("my_udt_value"))

# Виводимо результат
result_df.show()
"""
True

True

## [Misc Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#misc-functions)

#### pyspark.sql.functions.aes_decrypt(input: ColumnOrName, key: ColumnOrName, mode: Optional[ColumnOrName] = None, padding: Optional[ColumnOrName] = None, aad: Optional[ColumnOrName] = None) → pyspark.sql.column.Column

##### The Advanced Encryption Standard (AES) 
Returns a decrypted value of input using AES in mode with padding. Key lengths of 16, 24 and 32 bits are supported. Supported combinations of (mode, padding) are (‘ECB’, ‘PKCS’), (‘GCM’, ‘NONE’) and (‘CBC’, ‘PKCS’). Optional additional authenticated data (AAD) is only supported for GCM. If provided for encryption, the identical AAD value must be provided for decryption. The default mode is GCM.

Parameters:
- input: Column or str | The binary value to decrypt.
- key: Column or str | The passphrase to use to decrypt the data.
- mode: Column or str, optional | Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB, GCM, CBC.
- padding: Column or str, optional | Specifies how to pad messages whose length is not a multiple of the block size. Valid values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS for CBC.
- aad: Column or str, optional | Optional additional authenticated data. Only supported for GCM mode. This can be any free-form input and must be provided for both encryption and decryption.

In [7]:
df = spark.createDataFrame([(
    "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4",
    "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT",
    "This is an AAD mixed into the input",)],
    ["input", "key", "mode", "padding", "aad"]
)
df.select(aes_decrypt(
    unbase64(df.input), df.key, df.mode, df.padding, df.aad).alias('r')
).collect()

[Row(r=bytearray(b'Spark'))]

In [8]:
df = spark.createDataFrame([(
    "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=",
    "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)],
    ["input", "key", "mode", "padding"]
)
df.select(aes_decrypt(
    unbase64(df.input), df.key, df.mode, df.padding).alias('r')
).collect()

[Row(r=bytearray(b'Spark'))]

In [9]:
df.select(aes_decrypt(unbase64(df.input), df.key, df.mode).alias('r')).collect()

[Row(r=bytearray(b'Spark'))]

In [10]:
df = spark.createDataFrame([(
    "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94",
    "0000111122223333",)],
    ["input", "key"]
)
df.select(aes_decrypt(unhex(df.input), df.key).alias('r')).collect()

[Row(r=bytearray(b'Spark'))]

#### pyspark.sql.functions.aes_encrypt(input: ColumnOrName, key: ColumnOrName, mode: Optional[ColumnOrName] = None, padding: Optional[ColumnOrName] = None, iv: Optional[ColumnOrName] = None, aad: Optional[ColumnOrName] = None) → pyspark.sql.column.Column

##### The Advanced Encryption Standard (AES) 
Returns an encrypted value of input using AES in given mode with the specified padding. Key lengths of 16, 24 and 32 bits are supported. Supported combinations of (mode, padding) are (‘ECB’, ‘PKCS’), (‘GCM’, ‘NONE’) and (‘CBC’, ‘PKCS’). Optional initialization vectors (IVs) are only supported for CBC and GCM modes. These must be 16 bytes for CBC and 12 bytes for GCM. If not provided, a random vector will be generated and prepended to the output. Optional additional authenticated data (AAD) is only supported for GCM. If provided for encryption, the identical AAD value must be provided for decryption. The default mode is GCM.

Parameters: 
- input: Column or str | The binary value to encrypt.
- key: Column or str | The passphrase to use to encrypt the data.
- mode: Column or str, optional | Specifies which block cipher mode should be used to encrypt messages. Valid modes: ECB, GCM, CBC.
- padding: Column or str, optional | Specifies how to pad messages whose length is not a multiple of the block size. Valid values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS for CBC.
- iv: Column or str, optional | Optional initialization vector. Only supported for CBC and GCM modes. Valid values: None or “”. 16-byte array for CBC mode. 12-byte array for GCM mode.
- aad: Column or str, optional | Optional additional authenticated data. Only supported for GCM mode. This can be any free-form input and must be provided for both encryption and decryption.

In [11]:
df = spark.createDataFrame([(
    "Spark", "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT",
    "000000000000000000000000", "This is an AAD mixed into the input",)],
    ["input", "key", "mode", "padding", "iv", "aad"]
)
df.select(base64(aes_encrypt(
    df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit("hex")), df.aad)
).alias('r')).collect()

[Row(r='AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4')]

In [12]:
df.select(base64(aes_encrypt(
    df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit("hex")))
).alias('r')).collect()

[Row(r='AAAAAAAAAAAAAAAAQiYi+sRNYDAOTjdSEcYBFsAWPL1f')]

In [13]:
df = spark.createDataFrame([(
    "Spark SQL", "1234567890abcdef", "ECB", "PKCS",)],
    ["input", "key", "mode", "padding"]
)
df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode, df.padding),
    df.key, df.mode, df.padding).alias('r')
).collect()

[Row(r=bytearray(b'Spark SQL'))]

In [14]:
df = spark.createDataFrame([(
    "Spark SQL", "0000111122223333", "ECB",)],
    ["input", "key", "mode"]
)
df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode),
    df.key, df.mode).alias('r')
).collect()

[Row(r=bytearray(b'Spark SQL'))]

In [15]:
df = spark.createDataFrame([(
    "Spark SQL", "abcdefghijklmnop",)],
    ["input", "key"]
)
df.select(aes_decrypt(
    unbase64(base64(aes_encrypt(df.input, df.key))), df.key
).cast("STRING").alias('r')).collect()

[Row(r='Spark SQL')]

In [16]:
key_1 = "0000111122223333"
key_2 = "abcdefghijklmnop"

df = spark.createDataFrame([(
    "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94",
    key_1,)],
    ["input", "key"]
)
df.select(aes_decrypt(unhex(df.input), df.key).alias('r')).collect()

[Row(r=bytearray(b'Spark'))]

In [17]:
df = spark.createDataFrame([(
    "Spark SQL", key_2,)],
    ["input", "key"]
)

df_key_2_en = df.select(aes_encrypt(df.input, df.key).cast("STRING").alias('r'))
df_key_2_en.collect()

[Row(r='!\x196�:���\x7f�4�]��hT�5\x7fshm<\x0b4wO\x06�\r��\x10�1')]

In [18]:
df_key_2_de = df_key_2_en.select(aes_decrypt("r", lit(key_2)).cast("STRING").alias('r'))
df_key_2_de.collect()

[Row(r='Spark SQL')]

#### pyspark.sql.functions.bitmap_bit_position(col: ColumnOrName) → pyspark.sql.column.Column
Returns the bit position for the given input column.

In [19]:
df = spark.createDataFrame([(123,)], ["a"])
df.select(bitmap_bit_position(df.a).alias("r")).collect()

[Row(r=122)]

In [20]:
tour.limit(20).select("Value", bitmap_bit_position("Value")).collect()

[Row(Value='71', bitmap_bit_position(Value)=70),
 Row(Value='77', bitmap_bit_position(Value)=76),
 Row(Value='83', bitmap_bit_position(Value)=82),
 Row(Value='54', bitmap_bit_position(Value)=53),
 Row(Value='100', bitmap_bit_position(Value)=99),
 Row(Value='63', bitmap_bit_position(Value)=62),
 Row(Value='88', bitmap_bit_position(Value)=87),
 Row(Value='64', bitmap_bit_position(Value)=63),
 Row(Value='64', bitmap_bit_position(Value)=63),
 Row(Value='92', bitmap_bit_position(Value)=91),
 Row(Value='75', bitmap_bit_position(Value)=74),
 Row(Value='54', bitmap_bit_position(Value)=53),
 Row(Value='76', bitmap_bit_position(Value)=75),
 Row(Value='94', bitmap_bit_position(Value)=93),
 Row(Value='82', bitmap_bit_position(Value)=81),
 Row(Value='85', bitmap_bit_position(Value)=84),
 Row(Value='79', bitmap_bit_position(Value)=78),
 Row(Value='89', bitmap_bit_position(Value)=88),
 Row(Value='88', bitmap_bit_position(Value)=87),
 Row(Value='91', bitmap_bit_position(Value)=90)]

In [21]:
tour.limit(20).select(bitmap_bit_position(tour.Value)).collect()

[Row(bitmap_bit_position(Value)=70),
 Row(bitmap_bit_position(Value)=76),
 Row(bitmap_bit_position(Value)=82),
 Row(bitmap_bit_position(Value)=53),
 Row(bitmap_bit_position(Value)=99),
 Row(bitmap_bit_position(Value)=62),
 Row(bitmap_bit_position(Value)=87),
 Row(bitmap_bit_position(Value)=63),
 Row(bitmap_bit_position(Value)=63),
 Row(bitmap_bit_position(Value)=91),
 Row(bitmap_bit_position(Value)=74),
 Row(bitmap_bit_position(Value)=53),
 Row(bitmap_bit_position(Value)=75),
 Row(bitmap_bit_position(Value)=93),
 Row(bitmap_bit_position(Value)=81),
 Row(bitmap_bit_position(Value)=84),
 Row(bitmap_bit_position(Value)=78),
 Row(bitmap_bit_position(Value)=88),
 Row(bitmap_bit_position(Value)=87),
 Row(bitmap_bit_position(Value)=90)]

#### pyspark.sql.functions.bitmap_bucket_number(col: ColumnOrName) → pyspark.sql.column.Column
Returns the bucket number for the given input column.

In [22]:
df = spark.createDataFrame([(123,)], ["a"])
df.select(bitmap_bucket_number(df.a).alias("r")).collect()

23/10/22 15:48:32 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


[Row(r=1)]

In [23]:
tour.limit(20).select("Value", bitmap_bucket_number("Value")).collect()

[Row(Value='71', bitmap_bucket_number(Value)=1),
 Row(Value='77', bitmap_bucket_number(Value)=1),
 Row(Value='83', bitmap_bucket_number(Value)=1),
 Row(Value='54', bitmap_bucket_number(Value)=1),
 Row(Value='100', bitmap_bucket_number(Value)=1),
 Row(Value='63', bitmap_bucket_number(Value)=1),
 Row(Value='88', bitmap_bucket_number(Value)=1),
 Row(Value='64', bitmap_bucket_number(Value)=1),
 Row(Value='64', bitmap_bucket_number(Value)=1),
 Row(Value='92', bitmap_bucket_number(Value)=1),
 Row(Value='75', bitmap_bucket_number(Value)=1),
 Row(Value='54', bitmap_bucket_number(Value)=1),
 Row(Value='76', bitmap_bucket_number(Value)=1),
 Row(Value='94', bitmap_bucket_number(Value)=1),
 Row(Value='82', bitmap_bucket_number(Value)=1),
 Row(Value='85', bitmap_bucket_number(Value)=1),
 Row(Value='79', bitmap_bucket_number(Value)=1),
 Row(Value='89', bitmap_bucket_number(Value)=1),
 Row(Value='88', bitmap_bucket_number(Value)=1),
 Row(Value='91', bitmap_bucket_number(Value)=1)]

#### pyspark.sql.functions.bitmap_construct_agg(col: ColumnOrName) → pyspark.sql.column.Column
Returns a bitmap with the positions of the bits set from all the values from the input column. The input column will most likely be bitmap_bit_position().

In [24]:
df = spark.createDataFrame([(1,),(2,),(3,)], ["a"])
df.select(substring(hex(
    bitmap_construct_agg(bitmap_bit_position(df.a))
), 0, 6).alias("r")).collect()

[Row(r='070000')]

In [25]:
students.limit(10).select("math score", bitmap_bit_position("math score")).collect()

[Row(math score=72, bitmap_bit_position(math score)=71),
 Row(math score=69, bitmap_bit_position(math score)=68),
 Row(math score=90, bitmap_bit_position(math score)=89),
 Row(math score=47, bitmap_bit_position(math score)=46),
 Row(math score=76, bitmap_bit_position(math score)=75),
 Row(math score=71, bitmap_bit_position(math score)=70),
 Row(math score=88, bitmap_bit_position(math score)=87),
 Row(math score=40, bitmap_bit_position(math score)=39),
 Row(math score=64, bitmap_bit_position(math score)=63),
 Row(math score=38, bitmap_bit_position(math score)=37)]

In [26]:
students.select(substring(hex(
    bitmap_construct_agg(bitmap_bit_position("math score"))
), 0, 6).alias("r")).collect()

[Row(r='8100E6')]

In [27]:
students.select(hex(
    bitmap_construct_agg(bitmap_bit_position("math score"))
).alias("r")).collect()

[Row(r='8100E6BEFFFFFFFFFFFFFFFF0F000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

In [28]:
students.select(bitmap_construct_agg(bitmap_bit_position("math score")).alias("r")).collect()

[Row(r=bytearray(b'\x81\x00\xe6\xbe\xff\xff\xff\xff\xff\xff\xff\xff\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\

In [29]:
students.limit(20).select(substring(hex(
    bitmap_construct_agg(bitmap_bit_position("math score"))
), 0, 6).alias("r")).collect()

[Row(r='000002')]

In [30]:
students.limit(20).select(hex(bitmap_construct_agg(bitmap_bit_position("math score"))).alias("r")).collect()

[Row(r='00000200A0602282D128800200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

#### pyspark.sql.functions.bitmap_count(col: ColumnOrName) → pyspark.sql.column.Column
Returns the number of set bits in the input bitmap.

In [31]:
df = spark.createDataFrame([("FFFF",)], ["a"])
df.select(bitmap_count(to_binary(df.a, lit("hex"))).alias('r')).collect()

[Row(r=16)]

In [32]:
students.limit(10).select(bitmap_count(to_binary("math score", lit("hex"))).alias("r")).collect()

[Row(r=4),
 Row(r=4),
 Row(r=2),
 Row(r=4),
 Row(r=5),
 Row(r=4),
 Row(r=2),
 Row(r=1),
 Row(r=3),
 Row(r=3)]

#### pyspark.sql.functions.bitmap_or_agg(col: ColumnOrName) → pyspark.sql.column.Column
Returns a bitmap that is the bitwise OR of all of the bitmaps from the input column. The input column should be bitmaps created from bitmap_construct_agg().

In [33]:
df = spark.createDataFrame([("10",),("20",),("40",)], ["a"])
df.select(substring(hex(
    bitmap_or_agg(to_binary(df.a, lit("hex")))
), 0, 6).alias("r")).collect()

[Row(r='700000')]

In [34]:
students.limit(10).select(substring(hex(
    bitmap_or_agg(to_binary("math score", lit("hex")))), 0, 6).alias("r")).collect()

[Row(r='FF0000')]

#### pyspark.sql.functions.current_catalog() → pyspark.sql.column.Column
Returns the current catalog.

In [35]:
spark.range(1).select(current_catalog()).show()

+-----------------+
|current_catalog()|
+-----------------+
|    spark_catalog|
+-----------------+



#### pyspark.sql.functions.current_database() → pyspark.sql.column.Column[source]
Returns the current database.

In [36]:
spark.range(1).select(current_database()).show()

+------------------+
|current_database()|
+------------------+
|           default|
+------------------+



#### pyspark.sql.functions.current_schema() → pyspark.sql.column.Column
Returns the current database.

In [37]:
spark.range(1).select(current_schema()).show()

+------------------+
|current_database()|
+------------------+
|           default|
+------------------+



#### pyspark.sql.functions.current_user() → pyspark.sql.column.Column

In [38]:
spark.range(1).select(current_user()).show()

+--------------+
|current_user()|
+--------------+
|    zsavchenko|
+--------------+



#### pyspark.sql.functions.input_file_block_length() → pyspark.sql.column.Column
Returns the length of the block being read, or -1 if not available.

In [39]:
df = spark.read.text(path+"students.csv", lineSep=",")
df.select(input_file_block_length().alias('r')).first()

Row(r=72036)

In [40]:
df = spark.read.text(path+"pga_tour_historical.csv", lineSep=",")
df.select(input_file_block_length().alias('r')).first()

Row(r=21305899)

#### pyspark.sql.functions.input_file_block_start() → pyspark.sql.column.Column
Returns the start offset of the block being read, or -1 if not available.

In [41]:
df = spark.read.text(path+"students.csv", lineSep=",")
df.select(input_file_block_start().alias('r')).first()

Row(r=0)

In [42]:
df = spark.read.text(path+"pga_tour_historical.csv", lineSep=",")
df.select(input_file_block_start().alias('r')).first()

Row(r=0)

#### pyspark.sql.functions.md5(col: ColumnOrName) → pyspark.sql.column.Column
Calculates the MD5 digest and returns the value as a 32 character hex string.

In [43]:
spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()

[Row(hash='902fbdd2b1df0c4f70b4a5d23525e932')]

#### pyspark.sql.functions.sha(col: ColumnOrName) → pyspark.sql.column.Column
Returns a sha1 hash value as a hex string of the col.

In [44]:
spark.range(1).select(sha(lit("Spark"))).show(1, False)

+----------------------------------------+
|sha(Spark)                              |
+----------------------------------------+
|85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c|
+----------------------------------------+



#### pyspark.sql.functions.sha1(col: ColumnOrName) → pyspark.sql.column.Column
Returns the hex string result of SHA-1.

In [45]:
spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()

[Row(hash='3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]

#### pyspark.sql.functions.sha2(col: ColumnOrName, numBits: int) → pyspark.sql.column.Column
Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, and SHA-512). The numBits indicates the desired bit length of the result, which must have a value of 224, 256, 384, 512, or 0 (which is equivalent to 256).

Parameters: 
- col: Column or str | target column to compute on.
- num: Bitsint | the desired bit length of the result, which must have a value of 224, 256, 384, 512, or 0 (which is equivalent to 256).

Returns – Column | the column for computed results.

In [46]:
df = spark.createDataFrame([["Alice"], ["Bob"]], ["name"])
df.withColumn("sha2", sha2(df.name, 256)).show(truncate=False)

+-----+----------------------------------------------------------------+
|name |sha2                                                            |
+-----+----------------------------------------------------------------+
|Alice|3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043|
|Bob  |cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961|
+-----+----------------------------------------------------------------+



#### pyspark.sql.functions.crc32(col: ColumnOrName) → pyspark.sql.column.Column
Calculates the cyclic redundancy check value (CRC32) of a binary column and returns the value as a bigint.

In [47]:
spark.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect()

[Row(crc32=2743272264)]

#### pyspark.sql.functions.hash(*cols: ColumnOrName) → pyspark.sql.column.Column
Calculates the hash code of given columns, and returns the result as an int column.

In [48]:
df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2'])

df.select(hash('c1').alias('hash')).show()

df.select(hash('c1', 'c2').alias('hash')).show()

+----------+
|      hash|
+----------+
|-757602832|
+----------+

+---------+
|     hash|
+---------+
|599895104|
+---------+



#### pyspark.sql.functions.xxhash64(*cols: ColumnOrName) → pyspark.sql.column.Column
Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm, and returns the result as a long column. The hash computation uses an initial seed of 42.

In [49]:
df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2'])

df.select(xxhash64('c1').alias('hash')).show()

df.select(xxhash64('c1', 'c2').alias('hash')).show()

+-------------------+
|               hash|
+-------------------+
|4105715581806190027|
+-------------------+

+-------------------+
|               hash|
+-------------------+
|3233247871021311208|
+-------------------+



#### pyspark.sql.functions.assert_true(col: ColumnOrName, errMsg: Union[pyspark.sql.column.Column, str, None] = None) → pyspark.sql.column.Column
Returns null if the input column is true; throws an exception with the provided error message otherwise.

Parameters: 
- col: Column or str | column name or column that represents the input column to test.
- errMsg: Column or str, optional | A Python string literal or column containing the error message.

Returns – Column | null if the input column is true otherwise throws an error with specified message.

In [50]:
df = spark.createDataFrame([(0,1)], ['a', 'b'])
df.select(assert_true(df.a < df.b).alias('r')).collect()

[Row(r=None)]

In [51]:
df.select(assert_true(df.a < df.b, df.a).alias('r')).collect()

[Row(r=None)]

In [52]:
df.select(assert_true(df.a < df.b, 'error').alias('r')).collect()

[Row(r=None)]

In [53]:
from py4j.protocol import Py4JJavaError

try:
    df.select(assert_true(df.a > df.b, 'My error msg').alias('r')).collect()
except Py4JJavaError as e:
    print("My error msg" in e.__str__())

True


23/10/22 15:48:38 ERROR Executor: Exception in task 11.0 in stage 91.0 (TID 496)
java.lang.RuntimeException: My error msg
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.run

#### pyspark.sql.functions.raise_error(errMsg: Union[pyspark.sql.column.Column, str]) → pyspark.sql.column.Column
Throws an exception with the provided error message.

In [54]:
from py4j.protocol import Py4JJavaError

df = spark.range(1)

try:
    df.select(raise_error("My error msg")).show()
except Py4JJavaError as e:
    print("My error msg" in e.__str__())

True


23/10/22 15:48:38 ERROR Executor: Exception in task 11.0 in stage 92.0 (TID 508)
java.lang.RuntimeException: My error msg
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.run

#### pyspark.sql.functions.reflect(*cols: ColumnOrName) → pyspark.sql.column.Column
Calls a method with reflection.

Parameters – cols: Column or str | the first element should be a literal string for the class name, and the second element should be a literal string for the method name, and the remaining are input arguments to the Java method.

In [55]:
df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"])
df.select(
    reflect(lit("java.util.UUID"), lit("fromString"), df.a).alias('r')
).collect()

spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)

23/10/22 15:48:38 ERROR TaskSetManager: Task 11 in stage 92.0 failed 1 times; aborting job


[Row(r='a5cf6c42-0c85-418f-af6c-3e4e5b1328f2')]

#### pyspark.sql.functions.hll_sketch_estimate(col: ColumnOrName) → pyspark.sql.column.Column
Returns the estimated number of unique values given the binary representation of a Datasketches HllSketch.

[why] why not use hll_sketch_estimate all time?

Datasketches HyperLogLog (HLL) Sketch - це структура даних, призначена для наближеного підрахунку унікальних елементів у великих наборах даних. Вона використовує алгоритм HyperLogLog для оцінки кількості різних елементів у масиві, не зберігаючи всі ці елементи, а лише важливі характеристики.

In [56]:
df = spark.createDataFrame([1,2,2,3], "INT")
df = df.agg(hll_sketch_estimate(hll_sketch_agg("value")).alias("distinct_cnt"))
df.show()

+------------+
|distinct_cnt|
+------------+
|           3|
+------------+



In [57]:
from time import time
start = time()
tour.agg(hll_sketch_estimate(hll_sketch_agg("Value")).alias("distinct_cnt")).show()
print(f"execution_time - {time() - start}")

[Stage 97:>                                                       (0 + 12) / 12]

+------------+
|distinct_cnt|
+------------+
|      179970|
+------------+

execution_time - 1.1872360706329346


                                                                                

In [58]:
start = time()
print(tour.select("Value").distinct().count())
print(f"execution_time - {time() - start}")

[Stage 100:====>                                                  (1 + 11) / 12]

180981
execution_time - 1.5197079181671143


                                                                                

In [59]:
start = time()
students.agg(hll_sketch_estimate(hll_sketch_agg("math score")).alias("distinct_cnt")).show()
print(f"execution_time - {time() - start}")

+------------+
|distinct_cnt|
+------------+
|          81|
+------------+

execution_time - 0.07657694816589355


In [60]:
start = time()
print(students.select("math score").distinct().count())
print(f"execution_time - {time() - start}")

81
execution_time - 0.23991703987121582


#### pyspark.sql.functions.hll_union(col1: ColumnOrName, col2: ColumnOrName, allowDifferentLgConfigK: Optional[bool] = None) → pyspark.sql.column.Column
Merges two binary representations of Datasketches HllSketch objects, using a Datasketches Union object. Throws an exception if sketches have different lgConfigK values and allowDifferentLgConfigK is unset or set to false.

In [61]:
df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], "struct<v1:int,v2:int>")
df = df.agg(hll_sketch_agg("v1").alias("sketch1"), hll_sketch_agg("v2").alias("sketch2"))
df = df.withColumn("distinct_cnt", hll_sketch_estimate(hll_union("sketch1", "sketch2")))
df.show()
df.drop("sketch1", "sketch2").show()

+--------------------+--------------------+------------+
|             sketch1|             sketch2|distinct_cnt|
+--------------------+--------------------+------------+
|[02 01 07 0C 03 0...|[02 01 07 0C 03 0...|           6|
+--------------------+--------------------+------------+

+------------+
|distinct_cnt|
+------------+
|           6|
+------------+



In [62]:
df = students.agg(hll_sketch_agg("math score").alias("sketch1"),
                  hll_sketch_agg("reading score").alias("sketch2"),
                  hll_sketch_agg("writing score").alias("sketch3"),)
df = (df.withColumn("distinct_cnt1", hll_sketch_estimate(hll_union("sketch1", "sketch2")))
        .withColumn("distinct_cnt2", hll_sketch_estimate(hll_union("sketch1", "sketch3")))
        .withColumn("distinct_cnt3", hll_sketch_estimate(hll_union("sketch2", "sketch3")))
        .withColumn("distinct_cnt4", hll_sketch_estimate(hll_union("sketch3", "sketch2")))
)
df.show()
df.drop("sketch1", "sketch2", "sketch3").show()

+--------------------+--------------------+--------------------+-------------+-------------+-------------+-------------+
|             sketch1|             sketch2|             sketch3|distinct_cnt1|distinct_cnt2|distinct_cnt3|distinct_cnt4|
+--------------------+--------------------+--------------------+-------------+-------------+-------------+-------------+
|[03 01 07 0C 07 0...|[03 01 07 0C 07 0...|[03 01 07 0C 07 0...|           83|           83|           82|           82|
+--------------------+--------------------+--------------------+-------------+-------------+-------------+-------------+

+-------------+-------------+-------------+-------------+
|distinct_cnt1|distinct_cnt2|distinct_cnt3|distinct_cnt4|
+-------------+-------------+-------------+-------------+
|           83|           83|           82|           82|
+-------------+-------------+-------------+-------------+



#### pyspark.sql.functions.java_method(*cols: ColumnOrName) → pyspark.sql.column.Column
Calls a method with reflection.

Parameters – cols: Column or str | the first element should be a literal string for the class name, and the second element should be a literal string for the method name, and the remaining are input arguments to the Java method.

In [63]:
spark.range(1).select(
    java_method(
        lit("java.util.UUID"),
        lit("fromString"),
        lit("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2")
    )
).show(truncate=False)

+-----------------------------------------------------------------------------+
|java_method(java.util.UUID, fromString, a5cf6c42-0c85-418f-af6c-3e4e5b1328f2)|
+-----------------------------------------------------------------------------+
|a5cf6c42-0c85-418f-af6c-3e4e5b1328f2                                         |
+-----------------------------------------------------------------------------+



#### pyspark.sql.functions.stack(*cols: ColumnOrName) → pyspark.sql.column.Column
Separates col1, …, colk into n rows. Uses column names col0, col1, etc. by default unless specified otherwise.

Parameters – cols: Column or str | the first element should be a literal int for the number of rows to be separated, and the remaining are input elements to be separated.

In [64]:
df = spark.createDataFrame([(1, 2, 3)], ["a", "b", "c"])
df.select(stack(lit(2), df.a, df.b, df.c)).show(truncate=False)

+----+----+
|col0|col1|
+----+----+
|1   |2   |
|3   |NULL|
+----+----+



In [65]:
df = spark.createDataFrame([(1, 2, 3, 4), (1, 2, 3, 4)], ["a", "b", "c", "d"])
df.select(stack(lit(2), df.a, df.b, df.c, df.d)).show(truncate=False)

+----+----+
|col0|col1|
+----+----+
|1   |2   |
|3   |4   |
|1   |2   |
|3   |4   |
+----+----+



In [66]:
df.select(stack(lit(3), df.a, df.b, df.c, df.d)).show(truncate=False)

+----+----+
|col0|col1|
+----+----+
|1   |2   |
|3   |4   |
|NULL|NULL|
|1   |2   |
|3   |4   |
|NULL|NULL|
+----+----+



In [67]:
df.select(stack(lit(1), df.a, df.b, df.c, df.d)).show(truncate=False)

+----+----+----+----+
|col0|col1|col2|col3|
+----+----+----+----+
|1   |2   |3   |4   |
|1   |2   |3   |4   |
+----+----+----+----+



In [68]:
df.select(stack(lit(4), df.a, df.b, df.c, df.d)).show(truncate=False)

+----+
|col0|
+----+
|1   |
|2   |
|3   |
|4   |
|1   |
|2   |
|3   |
|4   |
+----+



#### pyspark.sql.functions.try_aes_decrypt(input: ColumnOrName, key: ColumnOrName, mode: Optional[ColumnOrName] = None, padding: Optional[ColumnOrName] = None, aad: Optional[ColumnOrName] = None) → pyspark.sql.column.Column
This is a special version of aes_decrypt that performs the same operation, but returns a NULL value instead of raising an error if the decryption cannot be performed. Returns a decrypted value of input using AES in mode with padding. Key lengths of 16, 24 and 32 bits are supported. Supported combinations of (mode, padding) are (‘ECB’, ‘PKCS’), (‘GCM’, ‘NONE’) and (‘CBC’, ‘PKCS’). Optional additional authenticated data (AAD) is only supported for GCM. If provided for encryption, the identical AAD value must be provided for decryption. The default mode is GCM.

Parameters: 
- input: Column or str | 
The binary value to decrypt.
- key: Column or str | 
The passphrase to use to decrypt the data.
- mode: Column or str, optional |
Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB, GCM, CBC.
- padding: Column or str, optional | 
Specifies how to pad messages whose length is not a multiple of the block size. Valid values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS for CBC.
- aad: Column or str, optional | 
Optional additional authenticated data. Only supported for GCM mode. This can be any free-form input and must be provided for both encryption and decryption.

In [69]:
df = spark.createDataFrame([(
    "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4",
    "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT",
    "This is an AAD mixed into the input",)],
    ["input", "key", "mode", "padding", "aad"]
)
df.select(try_aes_decrypt(
    unbase64(df.input), df.key, df.mode, df.padding, df.aad).alias('r')
).collect()

[Row(r=bytearray(b'Spark'))]

In [70]:
df = spark.createDataFrame([(
    "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=",
    "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)],
    ["input", "key", "mode", "padding"]
)
df.select(try_aes_decrypt(
    unbase64(df.input), df.key, df.mode, df.padding).alias('r')
).collect()

[Row(r=bytearray(b'Spark'))]

In [71]:
df.select(try_aes_decrypt(unbase64(df.input), df.key, df.mode).alias('r')).collect()

[Row(r=bytearray(b'Spark'))]

In [72]:
df = spark.createDataFrame([(
    "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94",
    "0000111122223333",)],
    ["input", "key"]
)
df.select(try_aes_decrypt(unhex(df.input), df.key).alias('r')).collect()

[Row(r=bytearray(b'Spark'))]

#### pyspark.sql.functions.typeof(col: ColumnOrName) → pyspark.sql.column.Column
Return DDL-formatted type string for the data type of the input.

In [73]:
df = spark.createDataFrame([(1,)], ["a"])
df.select(typeof(df.a).alias('r')).collect()

[Row(r='bigint')]

#### pyspark.sql.functions.user() → pyspark.sql.column.Column
Returns the current database.

In [74]:
spark.range(1).select(user()).show() 

+--------------+
|current_user()|
+--------------+
|    zsavchenko|
+--------------+



#### pyspark.sql.functions.version() → pyspark.sql.column.Column
Returns the Spark version. The string contains 2 fields, the first being a release version and the second being a git revision.

In [75]:
df = spark.range(1)
df.select(version()).show(truncate=False) 

+----------------------------------------------+
|version()                                     |
+----------------------------------------------+
|3.5.0 ce5ddad990373636e94071e7cef2f31021add07b|
+----------------------------------------------+



## [Predicate Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#predicate-functions)

Parameters:
* col1: Column or str
* col2: Column or str
* col3: Column or str

#### pyspark.sql.functions.equal_null(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column¶
Returns same result as the EQUAL(=) operator for non-null operands, but returns true if both are null, false if one of the them is null.

In [76]:
df = spark.createDataFrame([(None, None,), (1, 9,), (1, 1,),], ["a", "b"])
df.show()
df.select(equal_null(df.a, df.b).alias('r')).collect()

+----+----+
|   a|   b|
+----+----+
|NULL|NULL|
|   1|   9|
|   1|   1|
+----+----+



[Row(r=True), Row(r=False), Row(r=True)]

#### pyspark.sql.functions.ifnull(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Returns col2 if col1 is null, or col1 otherwise.

In [77]:
df = spark.createDataFrame([(None, 12,), (1, None,), (None, None,), (1, 9,),], ["e", "e2"])
df.show()
df.select(ifnull(df.e, df.e2)).show()

+----+----+
|   e|  e2|
+----+----+
|NULL|  12|
|   1|NULL|
|NULL|NULL|
|   1|   9|
+----+----+

+-------------+
|ifnull(e, e2)|
+-------------+
|           12|
|            1|
|         NULL|
|            1|
+-------------+



#### pyspark.sql.functions.isnotnull(col: ColumnOrName) → pyspark.sql.column.Column
Returns true if col is not null, or false otherwise.

In [78]:
df = spark.createDataFrame([(None,), (1,)], ["e"])
df.show()
df.select(isnotnull(df.e).alias('r')).collect()

+----+
|   e|
+----+
|NULL|
|   1|
+----+



[Row(r=False), Row(r=True)]

#### pyspark.sql.functions.nullif(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Returns null if col1 equals to col2, or col1 otherwise.

In [79]:
df = spark.createDataFrame([(None, None,), (1, 9,), (1, 1,), (None, 12,), (1, None,),], ["a", "b"])
df.show()
df.select(nullif(df.a, df.b).alias('r')).collect()

+----+----+
|   a|   b|
+----+----+
|NULL|NULL|
|   1|   9|
|   1|   1|
|NULL|  12|
|   1|NULL|
+----+----+



[Row(r=None), Row(r=1), Row(r=None), Row(r=None), Row(r=1)]

#### pyspark.sql.functions.nvl(col1: ColumnOrName, col2: ColumnOrName) → pyspark.sql.column.Column
Returns col2 if col1 is null, or col1 otherwise.

In [80]:
df = spark.createDataFrame([(None, 12,), (1, None,), (None, None,), (1, 9,),], ["a", "b"])
df.show()
df.select(nvl(df.a, df.b).alias('r')).collect()

+----+----+
|   a|   b|
+----+----+
|NULL|  12|
|   1|NULL|
|NULL|NULL|
|   1|   9|
+----+----+



[Row(r=12), Row(r=1), Row(r=None), Row(r=1)]

#### pyspark.sql.functions.nvl2(col1: ColumnOrName, col2: ColumnOrName, col3: ColumnOrName) → pyspark.sql.column.Column
Returns col2 if col1 is not null, or col3 otherwise.

In [81]:
df = spark.createDataFrame([(None, 8, 6,), (1, 8, 9,)], ["a", "b", "c"])
df.select(nvl2(df.a, df.b, df.c).alias('r')).collect()

[Row(r=6), Row(r=8)]

## [Xml Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#xml-functions)

#### pyspark.sql.functions.xpath(xml: ColumnOrName, path: ColumnOrName) → pyspark.sql.column.Column
Returns a string array of values within the nodes of xml that match the XPath expression.

In [82]:
df = spark.createDataFrame(
    [('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>',)], ['x'])
df.select(xpath(df.x, lit('a/b/text()')).alias('r')).collect()

[Row(r=['b1', 'b2', 'b3'])]

#### pyspark.sql.functions.xpath_boolean(xml: ColumnOrName, path: ColumnOrName) → pyspark.sql.column.Column¶
Returns true if the XPath expression evaluates to true, or if a matching node is found.

In [83]:
df = spark.createDataFrame([('<a><b>1</b></a>',)], ['x'])
df.select(xpath_boolean(df.x, lit('a/b')).alias('r')).collect()

[Row(r=True)]

#### pyspark.sql.functions.xpath_double(xml: ColumnOrName, path: ColumnOrName) → pyspark.sql.column.Column
Returns a double value, the value zero if no match is found, or NaN if a match is found but the value is non-numeric.

In [84]:
df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
df.select(xpath_double(df.x, lit('sum(a/b)')).alias('r')).collect()

[Row(r=3.0)]

#### pyspark.sql.functions.xpath_int(xml: ColumnOrName, path: ColumnOrName) → pyspark.sql.column.Column
Returns an integer value, or the value zero if no match is found, or a match is found but the value is non-numeric.

In [85]:
df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
df.select(xpath_int(df.x, lit('sum(a/b)')).alias('r')).collect()

[Row(r=3)]

#### pyspark.sql.functions.xpath_long(xml: ColumnOrName, path: ColumnOrName) → pyspark.sql.column.Column
Returns a long integer value, or the value zero if no match is found, or a match is found but the value is non-numeric.

In [86]:
df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
df.select(xpath_long(df.x, lit('sum(a/b)')).alias('r')).collect()

[Row(r=3)]

#### pyspark.sql.functions.xpath_number(xml: ColumnOrName, path: ColumnOrName) → pyspark.sql.column.Column
Returns a double value, the value zero if no match is found, or NaN if a match is found but the value is non-numeric.

In [87]:
spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']).select(xpath_number('x', lit('sum(a/b)'))).show()

+-------------------------+
|xpath_number(x, sum(a/b))|
+-------------------------+
|                      3.0|
+-------------------------+



#### pyspark.sql.functions.xpath_short(xml: ColumnOrName, path: ColumnOrName) → pyspark.sql.column.Column
Returns a short integer value, or the value zero if no match is found, or a match is found but the value is non-numeric.

In [88]:
df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])
df.select(xpath_short(df.x, lit('sum(a/b)')).alias('r')).collect()

[Row(r=3)]

#### pyspark.sql.functions.xpath_string(xml: ColumnOrName, path: ColumnOrName) → pyspark.sql.column.Column
Returns the text contents of the first xml node that matches the XPath expression.

In [89]:
df = spark.createDataFrame([('<a><b>b</b><c>cc</c></a>',)], ['x'])
df.select(xpath_string(df.x, lit('a/c')).alias('r')).collect()

[Row(r='cc')]