In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc=SparkContext(master="local")
spark = SparkSession.builder\
         .appName("Python Spark SQL Basic Example")\
         .config("spark.some.config.option","some-value")\
         .getOrCreate()

# Import data

In [3]:
iris = spark.read.csv('../../data/iris.csv', header=True, inferSchema=True)
iris.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [5]:
prostate = spark.read.csv('../../data/prostate.csv', header=True, inferSchema=True)
prostate.show(5)

+------------+-----------+---+------------+---+------------+-------+-----+------------+
|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|
|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|
|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
only showing top 5 rows



# Functions

In [6]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import pandas as pd

## abs

In [9]:
prostate.select('lpsa', abs(prostate.lpsa).alias('abs(lpsa)')).show(5)

+------------+-----------+
|        lpsa|  abs(lpsa)|
+------------+-----------+
|-0.430782916|0.430782916|
|-0.162518929|0.162518929|
|-0.162518929|0.162518929|
|-0.162518929|0.162518929|
| 0.371563556|0.371563556|
+------------+-----------+
only showing top 5 rows



## acos

In [10]:
pdf = pd.DataFrame({
    "x":list(-np.random.rand(5))+list(np.random.rand(5))
})
df = spark.createDataFrame(pdf)
df.show(5)

+-------------------+
|                  x|
+-------------------+
|-0.8205466165206964|
|-0.7183551207610612|
|-0.8673091252468418|
|-0.6628664944477599|
|-0.7034586331290031|
+-------------------+
only showing top 5 rows



In [11]:
df.select("x",acos(df.x)).show(5)

+-------------------+------------------+
|                  x|           ACOS(x)|
+-------------------+------------------+
|-0.8205466165206964| 2.533163016281679|
|-0.7183551207610612| 2.372231319141086|
|-0.8673091252468418|2.6205670579277704|
|-0.6628664944477599| 2.295437067430179|
|-0.7034586331290031| 2.351048447865742|
+-------------------+------------------+
only showing top 5 rows



## add_months

In [13]:
import datetime

In [15]:
base = datetime.date.today()
date_list = [base+datetime.timedelta(days=x) for x in list(range(0,10))*10]
pdf = pd.DataFrame({
    'dates':date_list
})
df = spark.createDataFrame(pdf)
df.show(5)

+----------+
|     dates|
+----------+
|2020-06-12|
|2020-06-13|
|2020-06-14|
|2020-06-15|
|2020-06-16|
+----------+
only showing top 5 rows



In [16]:
df.select("dates",add_months(df.dates,2).alias("new_dates")).show(5)

+----------+----------+
|     dates| new_dates|
+----------+----------+
|2020-06-12|2020-08-12|
|2020-06-13|2020-08-13|
|2020-06-14|2020-08-14|
|2020-06-15|2020-08-15|
|2020-06-16|2020-08-16|
+----------+----------+
only showing top 5 rows



## approx_count_distinct

In [17]:
prostate.select(approx_count_distinct(prostate.gleason)).show(5)

+------------------------------+
|approx_count_distinct(gleason)|
+------------------------------+
|                             4|
+------------------------------+



## array

In [18]:
iris.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [19]:
df_arr = iris.select("species",array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width']).alias("features"))

In [20]:
df_arr.show()

+-------+--------------------+
|species|            features|
+-------+--------------------+
| setosa|[5.1, 3.5, 1.4, 0.2]|
| setosa|[4.9, 3.0, 1.4, 0.2]|
| setosa|[4.7, 3.2, 1.3, 0.2]|
| setosa|[4.6, 3.1, 1.5, 0.2]|
| setosa|[5.0, 3.6, 1.4, 0.2]|
| setosa|[5.4, 3.9, 1.7, 0.4]|
| setosa|[4.6, 3.4, 1.4, 0.3]|
| setosa|[5.0, 3.4, 1.5, 0.2]|
| setosa|[4.4, 2.9, 1.4, 0.2]|
| setosa|[4.9, 3.1, 1.5, 0.1]|
| setosa|[5.4, 3.7, 1.5, 0.2]|
| setosa|[4.8, 3.4, 1.6, 0.2]|
| setosa|[4.8, 3.0, 1.4, 0.1]|
| setosa|[4.3, 3.0, 1.1, 0.1]|
| setosa|[5.8, 4.0, 1.2, 0.2]|
| setosa|[5.7, 4.4, 1.5, 0.4]|
| setosa|[5.4, 3.9, 1.3, 0.4]|
| setosa|[5.1, 3.5, 1.4, 0.3]|
| setosa|[5.7, 3.8, 1.7, 0.3]|
| setosa|[5.1, 3.8, 1.5, 0.3]|
+-------+--------------------+
only showing top 20 rows



## array_contains

In [21]:
df = df_arr.select("species","features",array_contains(df_arr.features,1.4).alias("new_features"))
df.show()

+-------+--------------------+------------+
|species|            features|new_features|
+-------+--------------------+------------+
| setosa|[5.1, 3.5, 1.4, 0.2]|        true|
| setosa|[4.9, 3.0, 1.4, 0.2]|        true|
| setosa|[4.7, 3.2, 1.3, 0.2]|       false|
| setosa|[4.6, 3.1, 1.5, 0.2]|       false|
| setosa|[5.0, 3.6, 1.4, 0.2]|        true|
| setosa|[5.4, 3.9, 1.7, 0.4]|       false|
| setosa|[4.6, 3.4, 1.4, 0.3]|        true|
| setosa|[5.0, 3.4, 1.5, 0.2]|       false|
| setosa|[4.4, 2.9, 1.4, 0.2]|        true|
| setosa|[4.9, 3.1, 1.5, 0.1]|       false|
| setosa|[5.4, 3.7, 1.5, 0.2]|       false|
| setosa|[4.8, 3.4, 1.6, 0.2]|       false|
| setosa|[4.8, 3.0, 1.4, 0.1]|        true|
| setosa|[4.3, 3.0, 1.1, 0.1]|       false|
| setosa|[5.8, 4.0, 1.2, 0.2]|       false|
| setosa|[5.7, 4.4, 1.5, 0.4]|       false|
| setosa|[5.4, 3.9, 1.3, 0.4]|       false|
| setosa|[5.1, 3.5, 1.4, 0.3]|        true|
| setosa|[5.7, 3.8, 1.7, 0.3]|       false|
| setosa|[5.1, 3.8, 1.5, 0.3]|  

## asc
asc returns a **sort expression**, which can be used as argument of sort functions such as pyspark.sql.DataFrame.sort and pyspar.sql.DataFrame.orderBy

In [22]:
prostate.sort(prostate.lpsa.asc()).show(5)

+------------+-----------+---+------------+---+------------+-------+-----+------------+
|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|
|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|
| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
only showing top 5 rows



In [23]:
prostate.orderBy(prostate.lpsa.asc()).show(5)

+------------+-----------+---+------------+---+------------+-------+-----+------------+
|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|
|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|
| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
only showing top 5 rows



* ascii
* ascin
* atan
* atan2

## avg

In [24]:
prostate.select(avg(prostate.lpsa)).show()

+------------------+
|         avg(lpsa)|
+------------------+
|2.4783868787422683|
+------------------+



## cbrt

In [26]:
prostate.select("lpsa",cbrt(prostate.lpsa)).show(5)

+------------+-------------------+
|        lpsa|         CBRT(lpsa)|
+------------+-------------------+
|-0.430782916|-0.7552420410177275|
|-0.162518929|-0.5457176294010901|
|-0.162518929|-0.5457176294010901|
|-0.162518929|-0.5457176294010901|
| 0.371563556| 0.7189152621521183|
+------------+-------------------+
only showing top 5 rows



## ceil

In [27]:
prostate.select("lpsa",ceil(prostate.lpsa)).show(5)

+------------+----------+
|        lpsa|CEIL(lpsa)|
+------------+----------+
|-0.430782916|         0|
|-0.162518929|         0|
|-0.162518929|         0|
|-0.162518929|         0|
| 0.371563556|         1|
+------------+----------+
only showing top 5 rows



## coalesce

In [28]:
df = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
df.show()

+----+----+
|   a|   b|
+----+----+
|null|null|
|   1|null|
|null|   2|
+----+----+



In [29]:
df.select(coalesce(df.a,df.b)).show()

+--------------+
|coalesce(a, b)|
+--------------+
|          null|
|             1|
|             2|
+--------------+



## col
Returns a Column based on the given column name. 

In [32]:
prostate.show(5)

+------------+-----------+---+------------+---+------------+-------+-----+------------+
|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|
|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|
|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
only showing top 5 rows



In [33]:
prostate.select(col("lcavol"),col("age")).show(5)

+------------+---+
|      lcavol|age|
+------------+---+
|-0.579818495| 50|
|-0.994252273| 58|
|-0.510825624| 74|
|-1.203972804| 58|
| 0.751416089| 62|
+------------+---+
only showing top 5 rows



## collect_list

In [34]:
pdf = pd.DataFrame({
    "x":[1,2,2,3,4,4,4,4]
})
df = spark.createDataFrame(pdf)
df.show()

+---+
|  x|
+---+
|  1|
|  2|
|  2|
|  3|
|  4|
|  4|
|  4|
|  4|
+---+



In [35]:
df.select(collect_list(df.x)).show()

+--------------------+
|     collect_list(x)|
+--------------------+
|[1, 2, 2, 3, 4, 4...|
+--------------------+



## collect_set

In [36]:
df.select(collect_set(df.x)).show()

+--------------+
|collect_set(x)|
+--------------+
|  [1, 2, 3, 4]|
+--------------+



## concat

In [37]:
df = spark.createDataFrame([['a', '1'], ['b', '2']], ['x', 'v'])
df.show()

+---+---+
|  x|  v|
+---+---+
|  a|  1|
|  b|  2|
+---+---+



In [38]:
df.select("x","v",concat(df.x,df.v).alias("concate(x,v)")).show()

+---+---+------------+
|  x|  v|concate(x,v)|
+---+---+------------+
|  a|  1|          a1|
|  b|  2|          b2|
+---+---+------------+



## concat_ws

In [40]:
df.select("x","v",concat_ws("_",df.x,df.v).alias("concate(x,v)")).show()

+---+---+------------+
|  x|  v|concate(x,v)|
+---+---+------------+
|  a|  1|         a_1|
|  b|  2|         b_2|
+---+---+------------+



## Corr

In [42]:
prostate.show(5)

+------------+-----------+---+------------+---+------------+-------+-----+------------+
|      lcavol|    lweight|age|        lbph|svi|         lcp|gleason|pgg45|        lpsa|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
|-0.579818495|2.769458829| 50|-1.386294361|  0|-1.386294361|      6|    0|-0.430782916|
|-0.994252273|3.319625728| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
|-0.510825624|2.691243083| 74|-1.386294361|  0|-1.386294361|      7|   20|-0.162518929|
|-1.203972804|3.282789151| 58|-1.386294361|  0|-1.386294361|      6|    0|-0.162518929|
| 0.751416089|3.432372999| 62|-1.386294361|  0|-1.386294361|      6|    0| 0.371563556|
+------------+-----------+---+------------+---+------------+-------+-----+------------+
only showing top 5 rows



In [44]:
prostate.select(corr(prostate.age,prostate.lpsa)).show(5)

+-------------------+
|    corr(age, lpsa)|
+-------------------+
|0.16959284228582772|
+-------------------+



## count

In [45]:
prostate.select(count(prostate.lpsa)).show()

+-----------+
|count(lpsa)|
+-----------+
|         97|
+-----------+



## countDistinct

In [46]:
iris.select(count(iris.species)).show()

+--------------+
|count(species)|
+--------------+
|           150|
+--------------+



## covar_pop

In [47]:
prostate.select(covar_pop(prostate.age,prostate.lpsa)).show()

+--------------------+
|covar_pop(age, lpsa)|
+--------------------+
|  1.4424746293984458|
+--------------------+



## covar_samp

In [49]:
prostate.select(covar_samp(prostate.age,prostate.lpsa)).show()

+---------------------+
|covar_samp(age, lpsa)|
+---------------------+
|   1.4575004067880128|
+---------------------+



## create_map

In [50]:
iris.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [51]:
df = iris.select(create_map('species',"sepal_length"))
df.show()

+--------------------------+
|map(species, sepal_length)|
+--------------------------+
|           [setosa -> 5.1]|
|           [setosa -> 4.9]|
|           [setosa -> 4.7]|
|           [setosa -> 4.6]|
|           [setosa -> 5.0]|
|           [setosa -> 5.4]|
|           [setosa -> 4.6]|
|           [setosa -> 5.0]|
|           [setosa -> 4.4]|
|           [setosa -> 4.9]|
|           [setosa -> 5.4]|
|           [setosa -> 4.8]|
|           [setosa -> 4.8]|
|           [setosa -> 4.3]|
|           [setosa -> 5.8]|
|           [setosa -> 5.7]|
|           [setosa -> 5.4]|
|           [setosa -> 5.1]|
|           [setosa -> 5.7]|
|           [setosa -> 5.1]|
+--------------------------+
only showing top 20 rows



In [52]:
df.dtypes

[('map(species, sepal_length)', 'map<string,double>')]

## current_date

In [53]:
df = spark.createDataFrame([[1],[2],[3],[4]], ['x'])
df.show()

+---+
|  x|
+---+
|  1|
|  2|
|  3|
|  4|
+---+



In [55]:
df.select("x",current_date()).show()

+---+--------------+
|  x|current_date()|
+---+--------------+
|  1|    2020-06-12|
|  2|    2020-06-12|
|  3|    2020-06-12|
|  4|    2020-06-12|
+---+--------------+



## current_tmestamp

In [56]:
df.select("x",current_timestamp()).show(truncate=False)

+---+----------------------+
|x  |current_timestamp()   |
+---+----------------------+
|1  |2020-06-12 15:51:33.19|
|2  |2020-06-12 15:51:33.19|
|3  |2020-06-12 15:51:33.19|
|4  |2020-06-12 15:51:33.19|
+---+----------------------+



## date_add

In [58]:
df2 = df.select('x', current_date().alias('current_date'))
df2.show(5)

+---+------------+
|  x|current_date|
+---+------------+
|  1|  2020-06-12|
|  2|  2020-06-12|
|  3|  2020-06-12|
|  4|  2020-06-12|
+---+------------+



In [59]:
df2.select('x', 'current_date', date_add(df2.current_date, 10)).show()

+---+------------+--------------------------+
|  x|current_date|date_add(current_date, 10)|
+---+------------+--------------------------+
|  1|  2020-06-12|                2020-06-22|
|  2|  2020-06-12|                2020-06-22|
|  3|  2020-06-12|                2020-06-22|
|  4|  2020-06-12|                2020-06-22|
+---+------------+--------------------------+



## date_format

In [61]:
df2.select("x","current_date",date_format("current_date","MM/dd/yyyy").alias("new_date")).show()

+---+------------+----------+
|  x|current_date|  new_date|
+---+------------+----------+
|  1|  2020-06-12|06/12/2020|
|  2|  2020-06-12|06/12/2020|
|  3|  2020-06-12|06/12/2020|
|  4|  2020-06-12|06/12/2020|
+---+------------+----------+

