In [0]:
# Display the SparkSession handle. `spark` is not created anywhere in this notebook;
# presumably the runtime (the dbfs:/ paths suggest Databricks) provides it — confirm.
spark

In [0]:
# Load the 2010-12-01 retail extract from DBFS. The first line of the file is the
# header, and column types are inferred from the data instead of declared.
df1 = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("dbfs:/FileStore/shared_uploads/creationsbyyogesh@gmail.com/2010_12_01.csv")
)

In [0]:
# Print the column names and the types that inferSchema assigned to them.
df1.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
# Preview the first five rows of the loaded data.
df1.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [0]:
# Register df1 as the temporary view "df1Table" so it can also be queried with Spark SQL.
df1.createOrReplaceTempView("df1Table")

### Initially, values are in their native Python types. We need to convert them into Spark types (for example with `lit`)

In [0]:
from pyspark.sql.functions import lit
# lit(5) wraps the Python literal 5 as a Spark Column; selecting it alongside "*"
# appends a constant column (named "5" in the output) to every row.
df1.select("*",lit(5)).show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+---+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|  5|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+---+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|  5|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|  5|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|  5|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|  5|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|  5|
+---------+---------+--------------------+--------+-------------------+---------+----------+------------

In [0]:
# Import `lit` here because it is the function this cell actually uses; `expr` is
# kept in the import so the name stays available to any cell that relies on it.
from pyspark.sql.functions import expr, lit
# Select only the constant column: one row of 5 per input row, no source columns.
df1.select(lit(5)).show(5)

+---+
|  5|
+---+
|  5|
|  5|
|  5|
|  5|
|  5|
+---+
only showing top 5 rows



## Boolean

In [0]:
from pyspark.sql.functions import col

# Keep every row whose invoice number differs from 536365. InvoiceNo is a
# string column, hence the comparison against a quoted literal.
df1.filter(col("InvoiceNo") != '536365').show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536366|    22633|HAND WARMER UNION...|       6|2010-12-01 08:28:00|     1.85|   17850.0|United Kingdom|
|   536366|    22632|HAND WARMER RED P...|       6|2010-12-01 08:28:00|     1.85|   17850.0|United Kingdom|
|   536367|    84879|ASSORTED COLOUR B...|      32|2010-12-01 08:34:00|     1.69|   13047.0|United Kingdom|
|   536367|    22745|POPPY'S PLAYHOUSE...|       6|2010-12-01 08:34:00|      2.1|   13047.0|United Kingdom|
|   536367|    22748|POPPY'S PLAYHOUSE...|       6|2010-12-01 08:34:00|      2.1|   13047.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import col

# Filter out invoice 536365, then project the result down to two columns.
(
    df1
    .filter(col("InvoiceNo") != '536365')
    .select("Description", "Country")
    .show(5)
)

+--------------------+--------------+
|         Description|       Country|
+--------------------+--------------+
|HAND WARMER UNION...|United Kingdom|
|HAND WARMER RED P...|United Kingdom|
|ASSORTED COLOUR B...|United Kingdom|
|POPPY'S PLAYHOUSE...|United Kingdom|
|POPPY'S PLAYHOUSE...|United Kingdom|
+--------------------+--------------+
only showing top 5 rows



In [0]:
# The same invoice filter written as a SQL expression string instead of a Column expression.
df1.where("InvoiceNo<>'536365'").show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536366|    22633|HAND WARMER UNION...|       6|2010-12-01 08:28:00|     1.85|   17850.0|United Kingdom|
|   536366|    22632|HAND WARMER RED P...|       6|2010-12-01 08:28:00|     1.85|   17850.0|United Kingdom|
|   536367|    84879|ASSORTED COLOUR B...|      32|2010-12-01 08:34:00|     1.69|   13047.0|United Kingdom|
|   536367|    22745|POPPY'S PLAYHOUSE...|       6|2010-12-01 08:34:00|      2.1|   13047.0|United Kingdom|
|   536367|    22748|POPPY'S PLAYHOUSE...|       6|2010-12-01 08:34:00|      2.1|   13047.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



### Write the following code using pyspark functions

SELECT * FROM df1Table WHERE StockCode in ("DOT") AND (UnitPrice > 600 OR
instr(Description, "POSTAGE") >= 1)

In [0]:
# `instr` is imported here for the cells that follow; this cell applies only the
# first part of the query, the StockCode membership test.
from pyspark.sql.functions import instr
df1.where(col("StockCode").isin("DOT")).show(5)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



PySpark AND operator for multiple conditions - &

In [0]:
# AND example: keep DOT rows priced above 600 whose description contains "POSTAGE".
# instr() returns a 1-based position (0 when the substring is absent), so a match
# is `>= 1`; testing `> 1` would wrongly skip descriptions that start with "POSTAGE".
df1.where(col("StockCode").isin("DOT"))\
    .where((col("UnitPrice")>600) & (instr(col("Description"),"POSTAGE")>=1)).show(5)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



PySpark OR operator for multiple conditions - |

In [0]:
from pyspark.sql.functions import instr

# Name the two predicates as Column expressions so they can be combined with `|` (OR).
priceFilter = df1.UnitPrice > 600
descriptionFilter = instr(df1.Description, "POSTAGE") >= 1

# DOT rows that satisfy either predicate.
(
    df1
    .filter(col("StockCode").isin("DOT"))
    .filter(priceFilter | descriptionFilter)
    .show(5)
)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



## Numbers

In [0]:
from pyspark.sql.functions import col,expr,pow

In [0]:
# (Quantity * UnitPrice) squared, plus 5, as a reusable Column expression.
# `pow` is pyspark.sql.functions.pow, imported in the cell above, not Python's built-in.
fabricatedColumn = pow(col("Quantity") * col("UnitPrice"),2) + 5

In [0]:
# Show the derived expression next to CustomerID, labelled "TruePrice".
df1.select(col("CustomerID"),fabricatedColumn.alias("TruePrice")).show(5)

+----------+------------------+
|CustomerID|         TruePrice|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
|   17850.0|          418.7156|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 5 rows



In [0]:
# NOTE: this import rebinds the name `round` to the Spark column function for the
# rest of the notebook, hiding Python's built-in round().
from pyspark.sql.functions import lit, round, bround
# Compare the two rounding functions on 2.5: round() gives 3.0, bround() gives 2.0.
df1.select(round(lit(2.5)), bround(lit(2.5))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [0]:
# Summary statistics (count, mean, stddev, min, max) for the columns of df1.
df1.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

## Strings

In [0]:
from pyspark.sql.functions import lit, initcap, ltrim, rtrim, trim, lpad, rpad

In [0]:
# initcap() capitalises the first letter of each word and lower-cases the rest.
df1.select(initcap(col("Description"))).show(5)

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
+--------------------+
only showing top 5 rows



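`lpad(col, len, pad)` – Add the specified character as padding on the left side, up to a total length of `len`.

`rpad(col, len, pad)` – Add the specified character as padding on the right side, up to a total length of `len`.

If `len` is shorter than the string, the string is truncated to `len` characters instead of being padded.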

In [0]:
string1 = "   Hello Madam   "

# Trim variants remove leading and/or trailing spaces. The lpad/rpad lengths
# (3 and 10) are shorter than the 17-character input, so here they truncate
# the string instead of padding it.
df1.select(
    ltrim(lit(string1)).alias("LTRIM"),
    rtrim(lit(string1)).alias("RTRIM"),
    trim(lit(string1)).alias("TRIM"),
    lpad(lit(string1), 3, " ").alias("LPAD"),
    rpad(lit(string1), 10, " ").alias("RPAD"),
).show(2)

+--------------+--------------+-----------+----+----------+
|         LTRIM|         RTRIM|       TRIM|LPAD|      RPAD|
+--------------+--------------+-----------+----+----------+
|Hello Madam   |   Hello Madam|Hello Madam|    |   Hello M|
|Hello Madam   |   Hello Madam|Hello Madam|    |   Hello M|
+--------------+--------------+-----------+----+----------+
only showing top 2 rows



## Regex

In [0]:
from pyspark.sql.functions import col, translate

In [0]:
# translate() substitutes characters one-for-one: L->1, E->3, A->4, T->7.
# The original Description is shown alongside for comparison.
df1.select(
    translate(col("Description"), "LEAT", "1347").alias("Replaced"),
    col("Description"),
).show(5)

+--------------------+--------------------+
|            Replaced|         Description|
+--------------------+--------------------+
|WHI73 H4NGING H34...|WHITE HANGING HEA...|
| WHI73 M3741 14N73RN| WHITE METAL LANTERN|
|CR34M CUPID H34R7...|CREAM CUPID HEART...|
|KNI773D UNION F14...|KNITTED UNION FLA...|
|R3D WOO11Y HO77I3...|RED WOOLLY HOTTIE...|
+--------------------+--------------------+
only showing top 5 rows



## Date & Timestamp

In [0]:
from pyspark.sql.functions import current_date, current_timestamp

In [0]:
# spark.range(10) yields a single `id` column holding 0 through 9.
spark.range(10).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [0]:
# Ten rows, each stamped with the current date and the current timestamp.
dfDate = (
    spark.range(10)
    .withColumn("Today Date", current_date())
    .withColumn("Time Now", current_timestamp())
)
# Also expose the frame to Spark SQL as "dfDateTable".
dfDate.createOrReplaceTempView("dfDateTable")

In [0]:
# Display the generated date and timestamp columns.
dfDate.show()

+---+----------+--------------------+
| id|Today Date|            Time Now|
+---+----------+--------------------+
|  0|2025-03-29|2025-03-29 08:30:...|
|  1|2025-03-29|2025-03-29 08:30:...|
|  2|2025-03-29|2025-03-29 08:30:...|
|  3|2025-03-29|2025-03-29 08:30:...|
|  4|2025-03-29|2025-03-29 08:30:...|
|  5|2025-03-29|2025-03-29 08:30:...|
|  6|2025-03-29|2025-03-29 08:30:...|
|  7|2025-03-29|2025-03-29 08:30:...|
|  8|2025-03-29|2025-03-29 08:30:...|
|  9|2025-03-29|2025-03-29 08:30:...|
+---+----------+--------------------+



In [0]:
# Confirm the column types: `Today Date` is a date and `Time Now` is a timestamp.
dfDate.printSchema()

root
 |-- id: long (nullable = false)
 |-- Today Date: date (nullable = false)
 |-- Time Now: timestamp (nullable = false)



**_date_add_**

**_date_sub_**

**_datediff_**

**_months_between_**

**_to_date_**

In [0]:
from pyspark.sql.functions import date_sub, date_add, datediff, months_between, to_date

In [0]:
# Show dfDate again. `Time Now` differs from the earlier display (08:39 vs 08:30),
# i.e. current_timestamp() is evaluated each time the frame is shown.
dfDate.show()

+---+----------+--------------------+
| id|Today Date|            Time Now|
+---+----------+--------------------+
|  0|2025-03-29|2025-03-29 08:39:...|
|  1|2025-03-29|2025-03-29 08:39:...|
|  2|2025-03-29|2025-03-29 08:39:...|
|  3|2025-03-29|2025-03-29 08:39:...|
|  4|2025-03-29|2025-03-29 08:39:...|
|  5|2025-03-29|2025-03-29 08:39:...|
|  6|2025-03-29|2025-03-29 08:39:...|
|  7|2025-03-29|2025-03-29 08:39:...|
|  8|2025-03-29|2025-03-29 08:39:...|
|  9|2025-03-29|2025-03-29 08:39:...|
+---+----------+--------------------+



In [0]:
# Shift `Today Date` five days backwards and five days forwards.
dfDate.select(
    "*",
    date_sub(col("Today Date"), 5).alias("5 Days Prior"),
    date_add(col("Today Date"), 5).alias("5 Days Later"),
).show()

+---+----------+--------------------+------------+------------+
| id|Today Date|            Time Now|5 Days Prior|5 Days Later|
+---+----------+--------------------+------------+------------+
|  0|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  1|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  2|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  3|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  4|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  5|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  6|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  7|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  8|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
|  9|2025-03-29|2025-03-29 08:41:...|  2025-03-24|  2025-04-03|
+---+----------+--------------------+------------+------------+



**Difference between two dates**

In [0]:
randomDate = to_date(lit('2025-01-27'))

# datediff(end, start) counts days from start to end, so the result is negative
# whenever randomDate falls before `Today Date`.
dfDate.select(
    col("Today Date"),
    datediff(randomDate, col("Today Date")).alias("Difference in Days"),
).show()

+----------+------------------+
|Today Date|Difference in Days|
+----------+------------------+
|2025-03-29|               -61|
|2025-03-29|               -61|
|2025-03-29|               -61|
|2025-03-29|               -61|
|2025-03-29|               -61|
|2025-03-29|               -61|
|2025-03-29|               -61|
|2025-03-29|               -61|
|2025-03-29|               -61|
|2025-03-29|               -61|
+----------+------------------+



**Months Between two dates**

In [0]:
startDate = to_date(lit('2024-09-12'))
endDate = to_date(lit('2025-02-28'))
# months_between(date1, date2) is date1 minus date2 in months; with the earlier date
# passed first the result is negative (-5.516...). Swap the arguments for a positive value.
dfDate.select(months_between(startDate, endDate)).show()

+--------------------------------------------------------------+
|months_between(to_date(2024-09-12), to_date(2025-02-28), true)|
+--------------------------------------------------------------+
|                                                   -5.51612903|
|                                                   -5.51612903|
|                                                   -5.51612903|
|                                                   -5.51612903|
|                                                   -5.51612903|
|                                                   -5.51612903|
|                                                   -5.51612903|
|                                                   -5.51612903|
|                                                   -5.51612903|
|                                                   -5.51612903|
+--------------------------------------------------------------+



**let’s take a look at the date format that has switched from year-month-day to year-day-month. Spark will fail to parse this date and silently return null instead**

In [0]:
# No format is given, so both strings are read as year-month-day: "2016-20-12" has
# no valid month 20 and becomes null, while "2017-12-11" parses as 11 December.
dfDate.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(1)

+-------------------+-------------------+
|to_date(2016-20-12)|to_date(2017-12-11)|
+-------------------+-------------------+
|               null|         2017-12-11|
+-------------------+-------------------+
only showing top 1 row



**_We find this to be an especially tricky situation for bugs because some dates might match the
correct format, whereas others do not. In the previous example, notice how the second date
appears as December 11th instead of the correct day, November 12th. Spark doesn’t throw an
error because it cannot know whether the days are mixed up or that specific row is incorrect.
Let’s fix this pipeline, step by step, and come up with a robust way to avoid these issues entirely.
The first step is to remember that we need to specify our date format according to the Java
SimpleDateFormat standard.
We will use two functions to fix this: to_date and to_timestamp. The former optionally
expects a format, whereas the latter requires one:_**

In [0]:
# Year-day-month pattern that matches how the sample strings are actually written.
dateFormat = 'yyyy-dd-MM'

# With the explicit pattern both strings parse: 20 December 2016 and 12 November 2017.
cleanedDfDate = dfDate.select(
    to_date(lit("2016-20-12"), dateFormat).alias("date1"),
    to_date(lit("2017-12-11"), dateFormat).alias("date2"),
)
cleanedDfDate.show()

+----------+----------+
|     date1|     date2|
+----------+----------+
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
|2016-12-20|2017-11-12|
+----------+----------+



In [0]:
from pyspark.sql.functions import to_timestamp
# `date1` is already a date column (built with to_date above); the result shown is
# midnight of that same date.
# NOTE(review): because the input is a date rather than a string, the format argument
# appears not to influence the result — confirm.
cleanedDfDate.select(to_timestamp(col("date1"),dateFormat)).show()

+-------------------------------+
|to_timestamp(date1, yyyy-dd-MM)|
+-------------------------------+
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
|            2016-12-20 00:00:00|
+-------------------------------+



## Handling Null Values

When we declare a column as not nullable, that is not actually enforced. To reiterate, when
you define a schema in which all columns are declared to not have null values, Spark will not enforce
that and will happily let null values into that column. The nullable signal is simply to help Spark SQL
optimize for handling that column. If you have null values in columns that should not have null values,
you can get an incorrect result or see strange exceptions that can be difficult to debug.

There are two things you can do with null values: you can explicitly drop nulls or you can fill
them with a value (globally or on a per-column basis).

**The simplest function is drop, which removes rows that contain nulls. The default is to drop any
row in which any value is null:**

In [0]:
# Re-print the schema: every column is reported as nullable = true.
df1.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
# Drop rows in which any column is null. The result is a new DataFrame that is
# not assigned here, so df1 itself is left unchanged.
df1.na.drop("any")

Out[4]: DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

Specifying "any" as an argument drops a row if any of the values are null. Using “all” drops the
row only if all values are null or NaN for that row:

In [0]:
# Drop only the rows in which every column is null; the result is not assigned.
df1.na.drop("all")

Out[5]: DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

We can also apply this to certain sets of columns by passing in an array of columns:

In [0]:
# Restrict the check to two columns: a row is dropped only when both StockCode
# and Quantity are null.
df1.na.drop("all",subset=["StockCode","Quantity"])

Out[8]: DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

**fill**

**Using the fill function, you can fill one or more columns with a set of values.**

In [0]:
# Fill nulls with a string. Per the PySpark fillna documentation a string value is
# applied to string columns only; numeric and timestamp columns keep their nulls.
df1.na.fill("All NULL values become this string")

Out[9]: DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

For multiple columns and values this can be done by specifying a map—that is a particular value and a set of columns.

In [0]:
# Per-column replacement values, keyed by column name.
# NOTE(review): StockCode is a string column but is given the integer 5 here;
# presumably Spark casts the value to the column's type — confirm, or use "5".
fill_na_cols = {"StockCode" : 5, "Description" : "No Value"}
df1.na.fill(fill_na_cols)

Out[11]: DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

## Working with Complex Types
There are three kinds of complex types: **structs, arrays,
and maps.**

**Structs**

You can think of structs as DataFrames within DataFrames. A worked example will illustrate
this more clearly. We can create a struct by wrapping a set of columns in parenthesis in a query:

In [0]:
from pyspark.sql.functions import struct
# Pack StockCode and Description into a single struct column named "complex".
complexDf = df1.select(struct("StockCode","Description").alias("complex"))
# Make the struct frame queryable from Spark SQL as "complexDfTable".
complexDf.createOrReplaceTempView("complexDfTable")

In [0]:
# Each row now holds one struct value of the form {StockCode, Description}.
complexDf.show(5)

+--------------------+
|             complex|
+--------------------+
|{85123A, WHITE HA...|
|{71053, WHITE MET...|
|{84406B, CREAM CU...|
|{84029G, KNITTED ...|
|{84029E, RED WOOL...|
+--------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import col
# Pull a single field back out of the struct; the result column is named complex.StockCode.
complexDf.select(col("complex").getField("StockCode")).show(5)

+-----------------+
|complex.StockCode|
+-----------------+
|           85123A|
|            71053|
|           84406B|
|           84029G|
|           84029E|
+-----------------+
only showing top 5 rows



**Arrays**

To define arrays, let’s work through a use case. With our current data, our objective is to take
every single word in our Description column and convert that into a row in our DataFrame.
The first task is to turn our Description column into a complex type, an array.

**split**

We do this by using the split function and specify the delimiter:

In [0]:
from pyspark.sql.functions import split

# Break each Description on single spaces, producing an array of words per row.
df1.select(
    split(col("Description"), " ")
).show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
+-------------------------+
only showing top 2 rows



In [0]:
# Name the word array, then index into it with SQL syntax to get the first word.
(
    df1
    .select(split(col("Description"), " ").alias("arrayDesc"))
    .selectExpr("arrayDesc[0]")
    .show(2)
)

+------------+
|arrayDesc[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



**ArrayLength & ArrayContains**

In [0]:
from pyspark.sql.functions import size, array_contains

In [0]:
# size() returns the number of elements in the array, i.e. the word count of each Description.
df1.select(size(split(col("Description")," ")).alias("arraySize")).show(2)

+---------+
|arraySize|
+---------+
|        5|
|        3|
+---------+
only showing top 2 rows



In [0]:
# True when the word array contains the exact element "WHITE".
df1.select(array_contains(split(col("Description")," "),"WHITE").alias("containsWhite")).show(2)

+-------------+
|containsWhite|
+-------------+
|         true|
|         true|
+-------------+
only showing top 2 rows



## Explode

### The explode function takes a column that consists of arrays and creates one row (with the rest of the values duplicated) per value in the array.

In [0]:
from pyspark.sql.functions import explode

# Split Description into words, then emit one output row per word. The other
# selected columns are repeated on every row produced from the same input row.
(
    df1
    .withColumn("splitted", split("Description", " "))
    .withColumn("exploded", explode("splitted"))
    .select(col("InvoiceNo"), col("splitted"), col("exploded"))
    .show(10)
)

+---------+--------------------+--------+
|InvoiceNo|            splitted|exploded|
+---------+--------------------+--------+
|   536365|[WHITE, HANGING, ...|   WHITE|
|   536365|[WHITE, HANGING, ...| HANGING|
|   536365|[WHITE, HANGING, ...|   HEART|
|   536365|[WHITE, HANGING, ...| T-LIGHT|
|   536365|[WHITE, HANGING, ...|  HOLDER|
|   536365|[WHITE, METAL, LA...|   WHITE|
|   536365|[WHITE, METAL, LA...|   METAL|
|   536365|[WHITE, METAL, LA...| LANTERN|
|   536365|[CREAM, CUPID, HE...|   CREAM|
|   536365|[CREAM, CUPID, HE...|   CUPID|
+---------+--------------------+--------+
only showing top 10 rows



In [0]:
# Same explode as above, with the array length added; arraySize is repeated on
# every row that came from the same Description.
(
    df1
    .withColumn("splitted", split("Description", " "))
    .withColumn("exploded", explode("splitted"))
    .withColumn("arraySize", size(col("splitted")))
    .select(
        col("InvoiceNo"),
        col("Description"),
        col("arraySize"),
        col("splitted"),
        col("exploded"),
    )
    .show(10)
)

+---------+--------------------+---------+--------------------+--------+
|InvoiceNo|         Description|arraySize|            splitted|exploded|
+---------+--------------------+---------+--------------------+--------+
|   536365|WHITE HANGING HEA...|        5|[WHITE, HANGING, ...|   WHITE|
|   536365|WHITE HANGING HEA...|        5|[WHITE, HANGING, ...| HANGING|
|   536365|WHITE HANGING HEA...|        5|[WHITE, HANGING, ...|   HEART|
|   536365|WHITE HANGING HEA...|        5|[WHITE, HANGING, ...| T-LIGHT|
|   536365|WHITE HANGING HEA...|        5|[WHITE, HANGING, ...|  HOLDER|
|   536365| WHITE METAL LANTERN|        3|[WHITE, METAL, LA...|   WHITE|
|   536365| WHITE METAL LANTERN|        3|[WHITE, METAL, LA...|   METAL|
|   536365| WHITE METAL LANTERN|        3|[WHITE, METAL, LA...| LANTERN|
|   536365|CREAM CUPID HEART...|        5|[CREAM, CUPID, HE...|   CREAM|
|   536365|CREAM CUPID HEART...|        5|[CREAM, CUPID, HE...|   CUPID|
+---------+--------------------+---------+---------

## Maps

Maps are created by using the `create_map` function and key-value pairs of columns. You can then select
from them just like you might select from an array:

In [0]:
from pyspark.sql.functions import create_map

# Each row gets a one-entry map of Description -> InvoiceNo. Looking up a key
# that is not that row's Description yields null.
(
    df1
    .select(create_map(col("Description"), col("InvoiceNo")).alias("complexMap"))
    .selectExpr("complexMap['WHITE METAL LANTERN']")
    .show(5)
)

+-------------------------------+
|complexMap[WHITE METAL LANTERN]|
+-------------------------------+
|                           null|
|                         536365|
|                           null|
|                           null|
|                           null|
+-------------------------------+
only showing top 5 rows



You can also explode map types, which will turn them into columns:

In [0]:
# Exploding a map column yields one row per entry, split into `key` and `value` columns.
(
    df1
    .select(create_map(col("Description"), col("InvoiceNo")).alias("complexMap"))
    .select(explode(col("complexMap")))
    .show(5)
)

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
|CREAM CUPID HEART...|536365|
|KNITTED UNION FLA...|536365|
|RED WOOLLY HOTTIE...|536365|
+--------------------+------+
only showing top 5 rows



## Working with JSON

Spark has some unique support for working with JSON data. You can operate directly on strings of JSON in Spark and parse from JSON or extract JSON objects. Let’s begin by creating a JSON column:

In [0]:
# One-row frame whose single column `jsonString` holds a nested JSON document as text.
jsonDf = spark.range(1).selectExpr(""" '{"myJsonKey" : {"myJsonValue" : [1,2,3]}}' as jsonString """)

In [0]:
from pyspark.sql.functions import get_json_object, json_tuple

# Follow a JSON path into the string; array indices are zero-based, so [1]
# selects the second element (2).
jsonDf.select(
    get_json_object(col("jsonString"), "$.myJsonKey.myJsonValue[1]").alias("column")
).show(1)

+------+
|column|
+------+
|     2|
+------+



In [0]:
# json_tuple() extracts the top-level key "myJsonKey"; its value comes back as
# JSON text in a column named c0.
jsonDf.select(json_tuple(col("jsonString"),"myJsonKey")).show(2)

+--------------------+
|                  c0|
+--------------------+
|{"myJsonValue":[1...|
+--------------------+

