# Chapter 6

In [1]:
# Read one day of the retail data set as CSV, using the header row for column
# names and letting Spark infer the column types.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../pyspark-training/data/The-Definitive-Guide/retail-data/by-day/2010-12-01.csv")
)
df.printSchema()
# Expose the DataFrame to SQL queries under the name dfTable.
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



## Converting to Spark Types

In [2]:
from pyspark.sql.functions import lit

In [3]:
# lit() wraps a native Python value as a Spark Column literal; the resulting
# column types (int, string, double) are visible in the DataFrame repr below.
df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

## Working with Booleans

In [4]:
from pyspark.sql.functions import col

In [5]:
# InvoiceNo is a string column (see the schema above), so it is compared with a
# string literal. Comparing it with the integer 536365 makes Spark cast every
# InvoiceNo to a number first; values that are not purely numeric then turn
# into NULL (or raise under ANSI mode) and are silently dropped by the filter.
df.where(col("InvoiceNo") != "536365")\
    .select("InvoiceNo", "Description")\
    .show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



Cleaner option

In [6]:
# Same filter written as a SQL expression string. The literal is quoted so the
# string column InvoiceNo is compared as a string instead of being cast to a
# number (which would drop non-numeric invoice numbers).
df.where("InvoiceNo != '536365'").show(5, False)

+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                  |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|536366   |22633    |HAND WARMER UNION JACK       |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536366   |22632    |HAND WARMER RED POLKA DOT    |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536367   |84879    |ASSORTED COLOUR BIRD ORNAMENT|32      |2010-12-01 08:34:00|1.69     |13047.0   |United Kingdom|
|536367   |22745    |POPPY'S PLAYHOUSE BEDROOM    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
|536367   |22748    |POPPY'S PLAYHOUSE KITCHEN    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
+---------+---------+-----------------------------+--------+----

### Filter by multiple conditions
`instr(str, substr)` - Returns the (1-based) index of the first occurrence of substr in str. 

Examples: `SELECT instr('SparkSQL', 'SQL')`; 6

In [7]:
from pyspark.sql.functions import instr
# Two boolean Column expressions that are kept in variables so they can be
# combined and reused.
priceFilter = col("UnitPrice") > 500
descripFilter = instr(df.Description, "POSTAGE") >= 1

# Rows must have StockCode "DOT" and satisfy at least one of the two filters.
(
    df.where(df.StockCode.isin("DOT"))
    .where(priceFilter | descripFilter)
    .show()
)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



### Add a boolean column

In [8]:
# A boolean expression is an ordinary column, so it can be added to the
# DataFrame as `isExpensive` instead of being used to filter rows.
DOTCodeFilter = col("StockCode") == "DOT"
df.withColumn(
    "isExpensive",
    DOTCodeFilter & (priceFilter | descripFilter),
).show(3)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|isExpensive|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|      false|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|      false|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|      false|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
only showing top 3 rows



## Working with Numbers
$Quantity = (the\ current\ quantity * the\ unit\ price)^2 + 5$

In [9]:
from pyspark.sql.functions import expr, pow

In [10]:
# (Quantity * UnitPrice)^2 + 5 expressed with Column arithmetic.
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(
    expr("CustomerId"),
    fabricatedQuantity.alias("realQuantity"),
).show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In SQL expression

In [11]:
# The same computation written entirely as SQL expression strings.
df.selectExpr("CustomerId", "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



### Rounding

In [12]:
from pyspark.sql.functions import lit, round, bround

In [13]:
# round() rounds half up (2.5 -> 3.0) while bround() rounds half to even
# (2.5 -> 2.0). The value is passed as a numeric literal rather than the string
# "2.5", so the result does not depend on an implicit string-to-number cast.
df.select(round(lit(2.5)), bround(lit(2.5))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



### Correlation

In [14]:
from pyspark.sql.functions import corr

In [15]:
# Correlation between two numeric columns via the DataFrame stat helper; the
# result comes back as a plain Python float (see the output below).
df.stat.corr("Quantity", "UnitPrice")

-0.04112314436835551

In [16]:
# The same correlation as an aggregate Column expression, returned as a
# one-row DataFrame.
df.select(corr("Quantity", "UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



### Statistic Summary
`describe()` method

In [17]:
# count / mean / stddev / min / max per column. The output below covers the
# numeric and string columns; the timestamp column InvoiceDate is not listed.
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

### Cross-tabulation and frequent item pairs

In [18]:
# Alternative, not executed here: cross-tabulate the two columns.
# df.stat.crosstab("StockCode", "Quantity").show()
# Frequent items of each listed column, returned as one array column per input column.
df.stat.freqItems(["StockCode", "Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 32...|
+--------------------+--------------------+



## Working with Strings

In [19]:
from pyspark.sql.functions import initcap, lower, upper

In [20]:
# Three casing helpers applied to the same column: capitalise each word,
# lower-case everything, upper-case everything.
df.select(
    initcap(col("Description")),
    lower(col("Description")),
    upper(col("Description")),
).show(2)

+--------------------+--------------------+--------------------+
|initcap(Description)|  lower(Description)|  upper(Description)|
+--------------------+--------------------+--------------------+
|White Hanging Hea...|white hanging hea...|WHITE HANGING HEA...|
| White Metal Lantern| white metal lantern| WHITE METAL LANTERN|
+--------------------+--------------------+--------------------+
only showing top 2 rows



### Regular Expression
#### Replace words

In [21]:
from pyspark.sql.functions import regexp_replace

In [22]:
# Any of the listed colour names is replaced by the literal word COLOR; the
# original Description is shown next to the cleaned value for comparison.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description"),
).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



#### Replace characters

In [23]:
from pyspark.sql.functions import translate

In [24]:
# translate() substitutes character by character rather than whole words:
# here L -> 1, E -> 3 and T -> 7 (compare the two columns in the output).
df.select(translate(col("Description"), "LEET", "1337"),
          col("Description")).show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



#### Pull out the first mentioned color

In [25]:
from pyspark.sql.functions import regexp_extract

In [26]:
# The parentheses make the colour alternatives capture group 1, which is the
# group index passed to regexp_extract below.
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
          col("Description")).show(2)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows



### [Hard] Return the integer location (1-based location)
See Page 96 for more details

In [27]:
from pyspark.sql.functions import expr, locate

In [28]:
simpleColors = ["black", "white", "red", "green", "blue"]


def color_locator(column, color_string):
    """Build a boolean flag column for one colour.

    The result of locate() for the upper-cased colour name in ``column`` is
    cast to boolean and aliased ``is_<color_string>``.
    """
    position = locate(color_string.upper(), column)
    return position.cast("boolean").alias("is_" + color_string)


# One flag column per colour, generated from the list so that the number of
# colours can vary without changing the select() call.
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
print(selectedColumns, '\n')
df.select(selectedColumns).where(expr("is_white OR is_red")).show(3, False)

# Add * to select all other columns
selectedColumns.append(expr("*"))
df.select(selectedColumns).where(expr("is_white OR is_red")).show(3, False)

[Column<b'CAST(locate(BLACK, Description, 1) AS BOOLEAN) AS `is_black`'>, Column<b'CAST(locate(WHITE, Description, 1) AS BOOLEAN) AS `is_white`'>, Column<b'CAST(locate(RED, Description, 1) AS BOOLEAN) AS `is_red`'>, Column<b'CAST(locate(GREEN, Description, 1) AS BOOLEAN) AS `is_green`'>, Column<b'CAST(locate(BLUE, Description, 1) AS BOOLEAN) AS `is_blue`'>] 

+--------+--------+------+--------+-------+
|is_black|is_white|is_red|is_green|is_blue|
+--------+--------+------+--------+-------+
|false   |true    |false |false   |false  |
|false   |true    |false |false   |false  |
|false   |true    |true  |false   |false  |
+--------+--------+------+--------+-------+
only showing top 3 rows

+--------+--------+------+--------+-------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|is_black|is_white|is_red|is_green|is_blue|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|Cust

## Working with Dates and Timestamps
### Current Date and Time

In [29]:
from pyspark.sql.functions import current_date, current_timestamp

In [30]:
# spark.range(10) is used to create a 10-row DataFrame with a single `id`
# column; the two withColumn calls add the current date and timestamp to it.
dateDF = (
    spark.range(10)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



### Subtract and Add Dates

In [31]:
from pyspark.sql.functions import date_add, date_sub

In [32]:
# Five days before and five days after `today`.
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2020-05-28|        2020-06-07|
+------------------+------------------+
only showing top 1 row



### Difference between Two Dates

In [33]:
from pyspark.sql.functions import datediff, months_between, to_date

In [34]:
# Day difference between a date one week back and today (-7 in the output).
(
    dateDF.withColumn("week_ago", date_sub(col("today"), 7))
    .select(datediff(col("week_ago"), col("today")))
    .show(1)
)

# Month difference between two fixed dates parsed from string literals.
(
    dateDF.select(
        to_date(lit("2016-01-01")).alias("start"),
        to_date(lit("2016-05-22")).alias("end"),
    )
    .select(months_between(col("start"), col("end")))
    .show(1)
)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                     -4.67741935|
+--------------------------------+
only showing top 1 row



### Deal with Different Date Formats 

In [35]:
# The pattern is year-day-month, so "2017-12-11" is read as 12 November 2017
# and "2017-20-12" as 20 December 2017 (see the output below).
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



### Timestamp

In [36]:
from pyspark.sql.functions import to_timestamp

In [37]:
# Convert the date column to a timestamp; the time part is 00:00:00 in the output below.
dateDF.select(to_timestamp(col("today"))).show(1)

+---------------------+
|to_timestamp(`today`)|
+---------------------+
|  2020-06-02 00:00:00|
+---------------------+
only showing top 1 row



## Working with NULL in Data
### Coalesce

In [38]:
from pyspark.sql.functions import coalesce, isnull

In [39]:
# coalesce() yields, per row, the first non-null value among its arguments:
# Description when present, otherwise CustomerID.
df.select(coalesce(col("Description"), col("CustomerID"))).show(2)

+---------------------------------+
|coalesce(Description, CustomerID)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
+---------------------------------+
only showing top 2 rows



### drop

In [40]:
# Null-dropping variants, kept commented out for reference (none is executed):
# df.na.drop()                # no arguments: same as "any"
# df.na.drop("any")           # drop a row if any of its values is null
# df.na.drop("all")           # drop a row only if all of its values are null
# df.na.drop("all", subset=["StockCode", "InvoiceNo"])  # only look at these columns

### fill

In [41]:
# Per-column replacement values for nulls, keyed by column name. na.fill
# returns a new DataFrame (its repr is the cell output); the result is not
# assigned, so `df` keeps its nulls.
fill_cols_vals = {"StockCode" : 5, "Description" : "No Value"}
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

### replace

In [42]:
# Replace empty-string values in Description with "UNKNOWN". The returned
# DataFrame is only displayed, not assigned.
df.na.replace([""], ["UNKNOWN"], "Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

## Working with Complex Types
### Structs

In [43]:
from pyspark.sql.functions import struct

In [44]:
# Pack two columns into a single struct column named `complex`.
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.show(2)

+--------------------+
|             complex|
+--------------------+
|[WHITE HANGING HE...|
|[WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



In [45]:
# Two ways to pull one field out of the struct: dot syntax in the column name
# and Column.getField(). The values match; only the output header differs.
complexDF.select("complex.Description").show(2)
complexDF.select(col("complex").getField("Description")).show(2)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows

+--------------------+
| complex.Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows



In [46]:
# "complex.*" expands every field of the struct back into top-level columns.
complexDF.select("complex.*").show(2)

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|WHITE HANGING HEA...|   536365|
| WHITE METAL LANTERN|   536365|
+--------------------+---------+
only showing top 2 rows



### Arrays
#### split

In [47]:
from pyspark.sql.functions import split

In [48]:
# Split Description on single spaces into an array column.
df.select(split(col("Description"), " ")).show(2) # The new column is of complex type

+---------------------+
|split(Description,  )|
+---------------------+
| [WHITE, HANGING, ...|
| [WHITE, METAL, LA...|
+---------------------+
only showing top 2 rows



In [49]:
# selectExpr() is used so the array can be indexed with SQL syntax:
# array_col[0] is the first word of each description.
df.select(split(col("Description"), " ").alias("array_col")).selectExpr("array_col[0]").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



#### Array Length

In [50]:
from pyspark.sql.functions import size

In [51]:
# Number of elements in the split array, i.e. space-separated tokens per description.
df.select(size(split(col("Description"), " "))).show(2)

+---------------------------+
|size(split(Description,  ))|
+---------------------------+
|                          5|
|                          3|
+---------------------------+
only showing top 2 rows



#### array_contains

In [52]:
from pyspark.sql.functions import array_contains

In [53]:
# True when the split array contains the element "WHITE".
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

+--------------------------------------------+
|array_contains(split(Description,  ), WHITE)|
+--------------------------------------------+
|                                        true|
|                                        true|
+--------------------------------------------+
only showing top 2 rows



#### explode
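`explode` takes a column of arrays and produces one output row per array element (the other columns are repeated), turning data held in a wide, in-column format into a long format.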

In [54]:
from pyspark.sql.functions import split, explode

In [55]:
# explode() emits one output row per array element; Description and InvoiceNo
# are repeated on every emitted row (see the output below).
(
    df.withColumn("splitted", split(col("Description"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "exploded")
    .show(2)
)

+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
+--------------------+---------+--------+
only showing top 2 rows



### Maps

In [56]:
from pyspark.sql.functions import create_map

In [57]:
# Build a map column with Description as the key and InvoiceNo as the value.
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map")).show(2)

+--------------------+
|         complex_map|
+--------------------+
|[WHITE HANGING HE...|
|[WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



In [58]:
# Look up one key in the map column; rows whose map does not hold that key
# show null in the output.
(
    df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
    .selectExpr("complex_map['WHITE METAL LANTERN']")
    .show(2)
)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



In [59]:
# Exploding a map column yields one row per map entry, split into separate
# `key` and `value` columns.
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
    .selectExpr("explode(complex_map)").show(2)

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
+--------------------+------+
only showing top 2 rows



## Working with JSON
### Create a JSON column

In [60]:
# One-row DataFrame whose single column, jsonString, holds a JSON document as text.
jsonDF = spark.range(1).selectExpr("""
'{"myJSONKey": {"myJSONValue" : [1,2,3]}}' as jsonString
""")

### Get JSON Value

In [61]:
from pyspark.sql.functions import get_json_object, json_tuple

In [62]:
# get_json_object follows a JSON path (here index 1 of the nested array, i.e. 2);
# json_tuple extracts the value under a top-level key, shown as column c0.
jsonDF.select(get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
              json_tuple(col("jsonString"), "myJSONKey")).show(2)

+------+--------------------+
|column|                  c0|
+------+--------------------+
|     2|{"myJSONValue":[1...|
+------+--------------------+



### Turn a StructType into a JSON String

In [63]:
from pyspark.sql.functions import to_json

In [64]:
# The SQL tuple (InvoiceNo, Description) builds a struct column, which
# to_json then serialises into a JSON string.
(
    df.selectExpr("(InvoiceNo, Description) as myStruct")
    .select(to_json(col("myStruct")))
    .show(2)
)

+-----------------------+
|structstojson(myStruct)|
+-----------------------+
|   {"InvoiceNo":"536...|
|   {"InvoiceNo":"536...|
+-----------------------+
only showing top 2 rows



### Parse a JSON String back into a Struct

In [65]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *

In [66]:
# Schema describing the JSON produced below: two nullable string fields.
parseSchema = StructType((
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True)
))

# Round trip: struct -> JSON string (to_json) -> struct again (from_json,
# which needs the schema to know how to parse the text).
df.selectExpr("(InvoiceNo, Description) as myStruct")\
    .select(to_json(col("myStruct")).alias("newJSON"))\
    .select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)

+----------------------+--------------------+
|jsontostructs(newJSON)|             newJSON|
+----------------------+--------------------+
|  [536365, WHITE HA...|{"InvoiceNo":"536...|
|  [536365, WHITE ME...|{"InvoiceNo":"536...|
+----------------------+--------------------+
only showing top 2 rows



## User Defined Functions

### Create Function

In [67]:
def power3(double_value):
    """Return ``double_value`` raised to the third power."""
    # `**` is used on purpose: the function is also called with Spark Columns
    # later in the notebook, where this operator is what builds the expression.
    cubed = double_value ** 3
    return cubed
power3(2.0)

8.0

### Use UDF

In [68]:
from pyspark.sql.functions import udf

udfExampleDF = spark.range(3).toDF("num")

# Calling the plain Python function on a Column does NOT run a UDF: `**` on a
# Column simply builds the native POWER(num, 3) expression.
udfExampleDF.select(power3(col("num"))).show()

# Wrapping the function with udf() is what makes Spark execute the Python code
# for every row.
power3udf = udf(power3)
udfExampleDF.select(power3udf(col("num"))).show()

+-------------+
|POWER(num, 3)|
+-------------+
|          0.0|
|          1.0|
|          8.0|
+-------------+



In [69]:
# Register the function under a SQL name before using it in an expression string.
spark.udf.register("power3py", power3)
udfExampleDF.selectExpr("power3py(num)").show()

+-------------+
|power3py(num)|
+-------------+
|            0|
|            1|
|            8|
+-------------+



### Define the return type of the function

In [70]:
from pyspark.sql.types import IntegerType, DoubleType

In [71]:
# Re-register the function with an explicit DoubleType return type. The output
# below is all null: `num` comes from spark.range (long values), so power3
# returns Python ints here.
# NOTE(review): per the PySpark UDF documentation, a returned value that does
# not match the declared return type becomes null instead of raising an error;
# returning a float, or declaring an integral return type, should avoid this.
spark.udf.register("power3py", power3, DoubleType())
udfExampleDF.selectExpr("power3py(num)").show()

+-------------+
|power3py(num)|
+-------------+
|         null|
|         null|
|         null|
+-------------+

