In [47]:
import findspark
findspark.init()
import pyspark
findspark.find()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
conf = pyspark.SparkConf().setAppName('appName').setMaster('locals')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)

In [49]:
df = spark.read.format("csv")\
.option("header","true")\
.option("inferSchema","true")\
.load("C:/Users/pilla/Documents/Spark-The-Definitive-Guide-master/Spark-The-Definitive-Guide-master/data/retail-data/by-day/2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



#### LITERALS

In [51]:
from pyspark.sql.functions import lit
df.select(lit(5),lit("five"),lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

#### Booleans in Spark

Boolean consists of four elements: and , or, true and false

In [52]:
from pyspark.sql.functions import col
df.where(col("InvoiceNo")!=536365)\
.select("InvoiceNo","Description")\
.show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



#### Another way to filter by specifying predicate as an expression in the string

In [53]:
df.where("InvoiceNo <>536365").show(5,False)

+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                  |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|536366   |22633    |HAND WARMER UNION JACK       |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536366   |22632    |HAND WARMER RED POLKA DOT    |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536367   |84879    |ASSORTED COLOUR BIRD ORNAMENT|32      |2010-12-01 08:34:00|1.69     |13047.0   |United Kingdom|
|536367   |22745    |POPPY'S PLAYHOUSE BEDROOM    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
|536367   |22748    |POPPY'S PLAYHOUSE KITCHEN    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
+---------+---------+-----------------------------+--------+----

In [54]:
from pyspark.sql.functions import instr
priceFilter = col("UnitPrice")>600
descripFilter = instr(df.Description,"POSTAGE") >=1
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [55]:
DOTCodeFilter = col("StockCode")=="DOT"
priceFilter = col("UnitPrice")>600
descripFilter= instr(col("Description"),"POSTAGE") >=1
df.withColumn("isExpensive",DOTCodeFilter & (priceFilter | descripFilter))\
.where("isExpensive")\
.select("unitPrice","isExpensive").show(5)

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [56]:
from pyspark.sql.functions import expr
df.withColumn("isExpensive",expr("NOT UnitPrice <=250"))\
.where("isExpensive")\
.select("Description","UnitPrice").show()

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



#### Working with Numbers

In [57]:
from pyspark.sql.functions import expr,pow
fabricatedQuantity = pow(col("Quantity")* col("UnitPrice"),2) +5
df.select(expr("CustomerId"),fabricatedQuantity.alias("realQuantity")).show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [58]:
### Using SQL to do the same

df.selectExpr("CustomerId",
             "(POWER((Quantity * UnitPrice),2)+ 5) as realQuantity").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



#### Round Function

In [59]:
from pyspark.sql.functions import round,lit,bround
df.select(round(lit("2.5")),bround(lit("2.5"))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



#### Correlation

In [60]:
from pyspark.sql.functions import corr
df.select(corr("Quantity","UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



#### Summary Statistics

In [61]:
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+-------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|        InvoiceDate|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+-------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|               3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128|               null| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|               null|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             

#### Statistics in Pyspark

In [62]:
from pyspark.sql.functions import count,mean,stddev_pop,min,max
colName ="UnitPrice"
quantileProbs =[0.5]
relError =0.05
df.stat.approxQuantile("UnitPrice",quantileProbs,relError)

[2.51]

In [63]:
df.stat.crosstab("StockCode","Quantity").show()

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             22578|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

In [64]:
df.stat.freqItems(["StockCode","Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 32...|
+--------------------+--------------------+



#### Add Unique ID (Monotonically Increasing)

In [65]:
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
+-----------------------------+
only showing top 2 rows



#### Working with Strings

##### InitCap Function

In [66]:
from pyspark.sql.functions import initcap,upper,lower

df.select(initcap(col("Description"))).show()

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
|Set 7 Babushka Ne...|
|Glass Star Froste...|
|Hand Warmer Union...|
|Hand Warmer Red P...|
|Assorted Colour B...|
|Poppy's Playhouse...|
|Poppy's Playhouse...|
|Feltcraft Princes...|
|Ivory Knitted Mug...|
|Box Of 6 Assorted...|
|Box Of Vintage Ji...|
|Box Of Vintage Al...|
|Home Building Blo...|
|Love Building Blo...|
|Recipe Box With M...|
+--------------------+
only showing top 20 rows



#### Upper and Lower Cases

In [67]:
df.select(col("Description"),upper(col("Description")),lower(col("Description")),upper(lower(col("Description")))).show(2)

+--------------------+--------------------+--------------------+-------------------------+
|         Description|  upper(Description)|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+--------------------+-------------------------+
only showing top 2 rows



#### lpad, ltrim, rpad,rtrim, trim

In [68]:
from pyspark.sql.functions import lit,ltrim,lpad,rpad,rtrim,trim
df.select(
ltrim(lit("      HELLO        ")).alias("ltrim"),
rtrim(lit("      HELLO        ")).alias("rtrim"),
trim(lit("      HELLO        ")).alias("trim"),
lpad(lit("HELLO"),3,"").alias("lpad"),
rpad(lit("HELLO"),10,"").alias("rpad")
).show(2)

+-------------+-----------+-----+----+-----+
|        ltrim|      rtrim| trim|lpad| rpad|
+-------------+-----------+-----+----+-----+
|HELLO        |      HELLO|HELLO| HEL|HELLO|
|HELLO        |      HELLO|HELLO| HEL|HELLO|
+-------------+-----------+-----+----+-----+
only showing top 2 rows



#### Regex Replace

In [72]:
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
regexp_replace(col("Description"),regex_string,"COLOR").alias("color_clean"), col("Description")).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



#### Translate

In [73]:
from pyspark.sql.functions import translate
df.select(translate(col("Description"),"LEFT","1337"),col("Description")).show(2)

+----------------------------------+--------------------+
|translate(Description, LEFT, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



#### Regexp Extract

In [78]:
from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(regexp_extract(col("Description"),extract_str,1).alias("color_clean"),col("Description")).show(15, False)

+-----------+-----------------------------------+
|color_clean|Description                        |
+-----------+-----------------------------------+
|WHITE      |WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE      |WHITE METAL LANTERN                |
|           |CREAM CUPID HEARTS COAT HANGER     |
|           |KNITTED UNION FLAG HOT WATER BOTTLE|
|RED        |RED WOOLLY HOTTIE WHITE HEART.     |
|           |SET 7 BABUSHKA NESTING BOXES       |
|           |GLASS STAR FROSTED T-LIGHT HOLDER  |
|           |HAND WARMER UNION JACK             |
|RED        |HAND WARMER RED POLKA DOT          |
|           |ASSORTED COLOUR BIRD ORNAMENT      |
|           |POPPY'S PLAYHOUSE BEDROOM          |
|           |POPPY'S PLAYHOUSE KITCHEN          |
|           |FELTCRAFT PRINCESS CHARLOTTE DOLL  |
|           |IVORY KNITTED MUG COSY             |
|           |BOX OF 6 ASSORTED COLOUR TEASPOONS |
+-----------+-----------------------------------+
only showing top 15 rows



#### Contains

In [80]:
from pyspark.sql.functions import instr
containsBlack = instr(col("Description"),"BLACK") >=1
containsWhite = instr(col("Description"),"WHITE") >=1
df.withColumn("hasSimpleColor",containsBlack| containsWhite)\
.where("hasSimpleColor")\
.select("Description").show(3, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



#### Dates and Time

In [90]:
from pyspark.sql.functions import current_date, current_timestamp
dateDF= spark.range(10)\
    .withColumn("today",current_date())\
    .withColumn("now",current_timestamp())
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [91]:
dateDF.show(10, False)

+---+----------+-----------------------+
|id |today     |now                    |
+---+----------+-----------------------+
|0  |2020-04-23|2020-04-23 21:18:26.715|
|1  |2020-04-23|2020-04-23 21:18:26.715|
|2  |2020-04-23|2020-04-23 21:18:26.715|
|3  |2020-04-23|2020-04-23 21:18:26.715|
|4  |2020-04-23|2020-04-23 21:18:26.715|
|5  |2020-04-23|2020-04-23 21:18:26.715|
|6  |2020-04-23|2020-04-23 21:18:26.715|
|7  |2020-04-23|2020-04-23 21:18:26.715|
|8  |2020-04-23|2020-04-23 21:18:26.715|
|9  |2020-04-23|2020-04-23 21:18:26.715|
+---+----------+-----------------------+



#### Date_sub and Date_Add

In [94]:
#### Add and Subtract 5 days from Date

from pyspark.sql.functions import date_add,date_sub
dateDF.select(date_sub(col("today"),5), date_add(col("today"),5)).show(5)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2020-04-18|        2020-04-28|
|        2020-04-18|        2020-04-28|
|        2020-04-18|        2020-04-28|
|        2020-04-18|        2020-04-28|
|        2020-04-18|        2020-04-28|
+------------------+------------------+
only showing top 5 rows



#### Date_Diff, Months_between and to_date

In [103]:
from pyspark.sql.functions import datediff,months_between,to_date,to_timestamp
dateDF.withColumn("week_ago",date_sub(col("today"),7))\
.select(datediff(col("week_ago"),col("today"))).show(3)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
|                       -7|
|                       -7|
+-------------------------+
only showing top 3 rows



In [99]:
###to_date converts string to date

spark.range(5).withColumn("date",lit("2018-05-01"))\
.withColumn("date2",lit("2018-15-01"))\
.select(to_date(col("date")),to_date(col("date2"))).show(5)

+---------------+----------------+
|to_date(`date`)|to_date(`date2`)|
+---------------+----------------+
|     2018-05-01|            null|
|     2018-05-01|            null|
|     2018-05-01|            null|
|     2018-05-01|            null|
|     2018-05-01|            null|
+---------------+----------------+



#### Need for formatting specification

In [109]:
date_format = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
to_date(lit("2018-05-01"),date_format).alias("date"),
to_date(lit("2018-15-01"),date_format).alias("date2"))
cleanDateDF.show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2018-01-05|2018-01-15|
+----------+----------+



#### to_timestamp

In [108]:
cleanDateDF.select(to_timestamp(col("date"),date_format)).show()

+----------------------------------+
|to_timestamp(`date`, 'yyyy-dd-MM')|
+----------------------------------+
|               2018-01-05 00:00:00|
+----------------------------------+



In [116]:
from pyspark.sql.functions import coalesce
df.select(coalesce(col("Description"),col("CustomerId"))).show()

+---------------------------------+
|coalesce(Description, CustomerId)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
|             CREAM CUPID HEART...|
|             KNITTED UNION FLA...|
|             RED WOOLLY HOTTIE...|
|             SET 7 BABUSHKA NE...|
|             GLASS STAR FROSTE...|
|             HAND WARMER UNION...|
|             HAND WARMER RED P...|
|             ASSORTED COLOUR B...|
|             POPPY'S PLAYHOUSE...|
|             POPPY'S PLAYHOUSE...|
|             FELTCRAFT PRINCES...|
|             IVORY KNITTED MUG...|
|             BOX OF 6 ASSORTED...|
|             BOX OF VINTAGE JI...|
|             BOX OF VINTAGE AL...|
|             HOME BUILDING BLO...|
|             LOVE BUILDING BLO...|
|             RECIPE BOX WITH M...|
+---------------------------------+
only showing top 20 rows



In [121]:
spark.sql("""select ifnull(null,'return_value') AS if_null,
nullif('value','value') as null_if,
nvl(null,'return_value') as NVL,
nvl2('not_null','return_Value','else_value') AS NVL_2
from dfTable LIMIT 1""").show()

+------------+-------+------------+------------+
|     if_null|null_if|         NVL|       NVL_2|
+------------+-------+------------+------------+
|return_value|   null|return_value|return_Value|
+------------+-------+------------+------------+



#### Drop

In [122]:
df.na.drop("all",subset = ["StockCode","InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

#### Fill

In [123]:
df.na.fill("all",subset = ["StockCode","InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

#### Fill custom values

In [124]:
fill_cols_vals = {"StockCode":5, "Description":"No Value"}
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

#### Replace

In [125]:
df.na.replace([""],["Unknown"],"Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

#### Struct

In [126]:
from pyspark.sql.functions import struct
complexDF = df.select(struct("Description","InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [130]:
complexDF.select("complex.Description").show(3)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
+--------------------+
only showing top 3 rows



In [131]:
complexDF.select(col("complex").getField("Description")).show(3)

+--------------------+
| complex.Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
+--------------------+
only showing top 3 rows



#### Arrays

#### Split Function

In [132]:
from pyspark.sql.functions import split
df.select(split(col("Description")," ")).show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
+-------------------------+
only showing top 2 rows



In [135]:
df.select(split(col("Description")," ").alias("array_col"))\
.selectExpr("array_col[0]").show(3)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
|       CREAM|
+------------+
only showing top 3 rows



#### Array Length

In [137]:
from pyspark.sql.functions import size
df.select(size(split(col("Description")," "))).show(2)

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|                              5|
|                              3|
+-------------------------------+
only showing top 2 rows



#### Array Contains

In [139]:
from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description")," "),"WHITE")).show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows



#### Explode

In [141]:
from pyspark.sql.functions import explode

df.withColumn("splitted",split(col("Description")," "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Description","InvoiceNo","exploded").show(3, False)

+----------------------------------+---------+--------+
|Description                       |InvoiceNo|exploded|
+----------------------------------+---------+--------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |WHITE   |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HANGING |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HEART   |
+----------------------------------+---------+--------+
only showing top 3 rows



#### Maps

In [148]:
from pyspark.sql.functions import create_map
df.select(create_map(col("Description"),col("InvoiceNo")).alias("complex_map"))\
.show(2)

+--------------------+
|         complex_map|
+--------------------+
|[WHITE HANGING HE...|
|[WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



#### Querying a Map using Key

In [151]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
         .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



In [157]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.selectExpr("explode(complex_map)").show(2)

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
+--------------------+------+
only showing top 2 rows



#### JSON

In [158]:
jsonDF = spark.range(1).selectExpr("""
'{"myJSONKey" : {"myJSONValue":[1,2,3]}}' as jsonString""")

In [160]:
from pyspark.sql.functions import get_json_object,json_tuple
jsonDF.select(
get_json_object(col("jsonString"),"$.myJSONKey.myJSONValue[1]").alias("column"),
json_tuple(col("jsonString"),"myJSONKey")).show(2)

+------+--------------------+
|column|                  c0|
+------+--------------------+
|     2|{"myJSONValue":[1...|
+------+--------------------+



#### Struct to JSON String

In [162]:
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo,Description) as myStruct")\
.select(to_json(col("myStruct"))).show(2)

+--------------------+
|   to_json(myStruct)|
+--------------------+
|{"InvoiceNo":"536...|
|{"InvoiceNo":"536...|
+--------------------+
only showing top 2 rows



In [163]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
StructField("InvoiceNo",StringType(),True),
StructField("Description",StringType(),True)))
df.selectExpr("(InvoiceNo,Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"),parseSchema), col("newJSON")).show(2)

+--------------------+--------------------+
|  from_json(newJSON)|             newJSON|
+--------------------+--------------------+
|[536365, WHITE HA...|{"InvoiceNo":"536...|
|[536365, WHITE ME...|{"InvoiceNo":"536...|
+--------------------+--------------------+
only showing top 2 rows



#### UDF

In [164]:
udfExampleDF = spark.range(5).toDF("num")
def power3(double_value):
    return double_value **3
power3(2.0)

8.0

In [165]:
from pyspark.sql.functions import udf
power3udf = udf(power3)
udfExampleDF.select(power3udf(col("num"))).show(2)

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
+-----------+
only showing top 2 rows



In [173]:
from pyspark.sql.types import IntegerType,DoubleType
spark.udf.register("power3py",power3,DoubleType())

udfExampleDF.selectExpr("power3py(num)").show(2)

+-------------+
|power3py(num)|
+-------------+
|         null|
|         null|
+-------------+
only showing top 2 rows

