# Chapter 6

In [2]:
# Load one day of the retail dataset. The first row is treated as the header and
# column types are inferred from the data rather than declared.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../pyspark-training/data/The-Definitive-Guide/retail-data/by-day/2010-12-01.csv")
)
df.printSchema()
# Register the frame under the name dfTable so it can also be queried with SQL.
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



## Converting to Spark Types

In [5]:
from pyspark.sql.functions import lit

In [7]:
# lit() wraps a native Python value in a Spark Column. The repr shown below
# reports the Spark type chosen for an int, a str and a float literal.
df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

## Working with Booleans

In [8]:
from pyspark.sql.functions import col

In [9]:
# InvoiceNo is a string column (see the printed schema), so compare it with a
# string literal. Comparing against the integer 536365 forces Spark to cast
# InvoiceNo implicitly; any non-numeric invoice number would then turn into null
# (and silently drop out of the filter) or raise under ANSI mode.
(
    df.where(col("InvoiceNo") != "536365")
    .select("InvoiceNo", "Description")
    .show(5, truncate=False)
)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



A cleaner option is to pass the predicate as a SQL expression string:

In [11]:
# Same filter written as a SQL predicate. The literal is quoted because InvoiceNo
# is a string column; an unquoted 536365 would rely on an implicit cast of the
# column for the comparison.
df.where("InvoiceNo != '536365'").show(5, truncate=False)

+---------+---------+-----------------------------+--------+---------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                  |Quantity|InvoiceDate          |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------+--------+---------------------+---------+----------+--------------+
|536366   |22633    |HAND WARMER UNION JACK       |6       |2010-12-01 08:28:00.0|1.85     |17850.0   |United Kingdom|
|536366   |22632    |HAND WARMER RED POLKA DOT    |6       |2010-12-01 08:28:00.0|1.85     |17850.0   |United Kingdom|
|536367   |84879    |ASSORTED COLOUR BIRD ORNAMENT|32      |2010-12-01 08:34:00.0|1.69     |13047.0   |United Kingdom|
|536367   |22745    |POPPY'S PLAYHOUSE BEDROOM    |6       |2010-12-01 08:34:00.0|2.1      |13047.0   |United Kingdom|
|536367   |22748    |POPPY'S PLAYHOUSE KITCHEN    |6       |2010-12-01 08:34:00.0|2.1      |13047.0   |United Kingdom|
+---------+---------+-----------------------------+--------+---------------------+---------+----------+--------------+

### Filter by multiple conditions
`instr(str, substr)` - Returns the (1-based) index of the first occurrence of substr in str. 

Example: `SELECT instr('SparkSQL', 'SQL')` returns `6`.

In [14]:
from pyspark.sql.functions import instr
# Two reusable predicates: a unit price above 500, or "POSTAGE" appearing in the
# description (instr is 1-based, so a position >= 1 means the substring was found).
priceFilter = col("UnitPrice") > 500
descripFilter = instr(df.Description, "POSTAGE") >= 1

# Chained where() calls are combined with AND; the OR between the two predicates
# has to be written explicitly with |.
(
    df.where(df.StockCode.isin("DOT"))
    .where(priceFilter | descripFilter)
    .show()
)

+---------+---------+--------------+--------+--------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|         InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+--------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:...|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:...|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+--------------------+---------+----------+--------------+



### Add a boolean column

In [17]:
# A boolean expression is just another column, so the combined predicate can be
# attached as a new column instead of being used to drop rows.
# priceFilter and descripFilter come from the previous cell.
DOTCodeFilter = col("StockCode") == "DOT"
(
    df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .show(3)
)

+---------+---------+--------------------+--------+--------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|         InvoiceDate|UnitPrice|CustomerID|       Country|isExpensive|
+---------+---------+--------------------+--------+--------------------+---------+----------+--------------+-----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:...|     2.55|   17850.0|United Kingdom|      false|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:...|     3.39|   17850.0|United Kingdom|      false|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:...|     2.75|   17850.0|United Kingdom|      false|
+---------+---------+--------------------+--------+--------------------+---------+----------+--------------+-----------+
only showing top 3 rows



## Working with Numbers
$\text{realQuantity} = (\text{Quantity} \times \text{UnitPrice})^2 + 5$

In [19]:
from pyspark.sql.functions import expr, pow

In [20]:
# (Quantity * UnitPrice)^2 + 5 expressed with column arithmetic.
# pow here is pyspark.sql.functions.pow (imported in the previous cell), not the
# Python built-in of the same name.
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
(
    df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity"))
    .show(2)
)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In SQL expression

In [21]:
# The same computation as the previous cell, written entirely as SQL expression
# strings; the output below matches the column-API version.
df.selectExpr("CustomerId", "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



### Rounding

In [22]:
from pyspark.sql.functions import lit, round, bround

In [24]:
# round and bround are the pyspark.sql.functions versions imported in the previous
# cell; that import shadows Python's built-in round for the rest of the notebook.
# A numeric literal is passed instead of the string "2.5", which only worked
# because Spark implicitly cast the string to a number.
# round rounds half up (2.5 -> 3.0); bround rounds half to even (2.5 -> 2.0).
df.select(round(lit(2.5)), bround(lit(2.5))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



### Correlation

In [25]:
from pyspark.sql.functions import corr

In [26]:
# Correlation between the two columns, returned directly as a Python float
# (see the bare number in the output below).
df.stat.corr("Quantity", "UnitPrice")

-0.04112314436835551

In [27]:
# The same correlation computed as an aggregate column expression; the result is
# a one-row DataFrame rather than a Python float.
df.select(corr("Quantity", "UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



### Statistic Summary
`describe()` method

In [28]:
# describe() summarises columns with count, mean, stddev, min and max
# (one summary row per statistic, as shown in the output below).
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|          Quantity|         UnitPrice|        CustomerID|
+-------+------------------+------------------+------------------+
|  count|              3108|              3108|              1968|
|   mean| 8.627413127413128| 4.151946589446603|15661.388719512195|
| stddev|26.371821677029203|15.638659854603892|1854.4496996893627|
|    min|               -24|               0.0|           12431.0|
|    max|               600|            607.49|           18229.0|
+-------+------------------+------------------+------------------+



### Cross-tabulation and frequent item pairs

In [30]:
# Cross-tabulation is left disabled.
# NOTE(review): presumably because the StockCode x Quantity table is very large;
# confirm before re-enabling.
# df.stat.crosstab("StockCode", "Quantity").show()
# freqItems yields one array column per requested column (<name>_freqItems)
# holding values that occur frequently in it.
df.stat.freqItems(["StockCode", "Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 50...|
+--------------------+--------------------+



## Working with Strings

In [34]:
from pyspark.sql.functions import initcap, lower, upper

In [35]:
# Three case conversions of the same column, shown side by side:
# capitalise each word, all lower case, all upper case.
df.select(
    initcap(col("Description")),
    lower(col("Description")),
    upper(col("Description")),
).show(2)

+--------------------+--------------------+--------------------+
|initcap(Description)|  lower(Description)|  upper(Description)|
+--------------------+--------------------+--------------------+
|White Hanging Hea...|white hanging hea...|WHITE HANGING HEA...|
| White Metal Lantern| white metal lantern| WHITE METAL LANTERN|
+--------------------+--------------------+--------------------+
only showing top 2 rows



### Regular Expression
#### Replace words

In [38]:
from pyspark.sql.functions import regexp_replace

In [39]:
# Alternation pattern matching any one of the listed colour words.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
# Replace every match with the literal COLOR and show it next to the original text.
(
    df.select(
        regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
        col("Description"),
    )
    .show(2)
)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



#### Replace characters

In [40]:
from pyspark.sql.functions import translate

In [41]:
# translate() substitutes character by character using the two strings as a
# lookup table: L -> 1, E -> 3, T -> 7.
df.select(
    translate(col("Description"), "LEET", "1337"),
    col("Description"),
).show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



#### Pull out the first mentioned color

In [42]:
from pyspark.sql.functions import regexp_extract

In [43]:
# The parentheses form capture group 1, which is the group requested below.
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
(
    df.select(
        regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
        col("Description"),
    )
    .show(2)
)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows



#### [Hard] Return the integer location (1-based)
See Page 96 for more details

In [44]:
from pyspark.sql.functions import expr, locate

In [70]:
simpleColors = ["black", "white", "red", "green", "blue"]


def color_locator(column, color_string):
    """Build a boolean flag column for one colour.

    Args:
        column: The Column to search (the description text in this notebook).
        color_string: Lower-case colour name; it is upper-cased before searching.

    Returns:
        A Column aliased ``is_<color_string>``: the position reported by
        ``locate`` cast to boolean.
    """
    position = locate(color_string.upper(), column)
    return position.cast("boolean").alias("is_" + color_string)


# One flag column per colour, whatever the length of simpleColors.
selectedColumns = [color_locator(df.Description, color_name) for color_name in simpleColors]
print(selectedColumns, '\n')

# Unpack the list into select()'s arguments, then filter on the generated flags.
df.select(*selectedColumns).where(expr("is_white OR is_red")).show(3, truncate=False)

# Add * to select all other columns
selectedColumns.append(expr("*"))
df.select(*selectedColumns).where(expr("is_white OR is_red")).show(3, truncate=False)

[Column<b'CAST(locate(BLACK, Description, 1) AS BOOLEAN) AS `is_black`'>, Column<b'CAST(locate(WHITE, Description, 1) AS BOOLEAN) AS `is_white`'>, Column<b'CAST(locate(RED, Description, 1) AS BOOLEAN) AS `is_red`'>, Column<b'CAST(locate(GREEN, Description, 1) AS BOOLEAN) AS `is_green`'>, Column<b'CAST(locate(BLUE, Description, 1) AS BOOLEAN) AS `is_blue`'>] 

+--------+--------+------+--------+-------+
|is_black|is_white|is_red|is_green|is_blue|
+--------+--------+------+--------+-------+
|false   |true    |false |false   |false  |
|false   |true    |false |false   |false  |
|false   |true    |true  |false   |false  |
+--------+--------+------+--------+-------+
only showing top 3 rows

+--------+--------+------+--------+-------+---------+---------+----------------------------------+--------+---------------------+---------+----------+--------------+
|is_black|is_white|is_red|is_green|is_blue|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate          |UnitPrice|