In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName("My app").getOrCreate()

In [3]:
df = spark.read.format("csv")\
    .option("header","true")\
    .option("inferSchema","true")\
    .load("retail-data/by-day/2010-12-01.csv")
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



### Boolean

In [4]:
from pyspark.sql.functions import col
df.where(col("InvoiceNo")!=56365)\
    .select("InvoiceNo","Description")\
    .show(5,False)


+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import instr

priceFilter=col("UnitPrice")>600
descripFilter=instr(df.Description,"POSTAGE")>=1

df.where(df.StockCode.isin("DOT"))\
    .where(priceFilter | descripFilter)\
    .show(5)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      NULL|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      NULL|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [6]:
from pyspark.sql.functions import instr
DOTCodeFilter = col("StockCode")=="DOT"
priceFilter=col("UnitPrice")>600
descripFilter=instr(col("Description"),"POSTAGE") >=1

df.withColumn("isExpensive",DOTCodeFilter &(priceFilter|descripFilter))\
    .where("isExpensive")\
    .select("unitPrice","isExpensive","Description")\
    .show(5)

+---------+-----------+--------------+
|unitPrice|isExpensive|   Description|
+---------+-----------+--------------+
|   569.77|       true|DOTCOM POSTAGE|
|   607.49|       true|DOTCOM POSTAGE|
+---------+-----------+--------------+



In [7]:
from pyspark.sql.functions import expr

df.withColumn("isExpensive",expr("NOT UnitPrice <=100"))\
    .where("isExpensive")\
    .select("Description","UnitPrice").show(5)

+--------------------+---------+
|         Description|UnitPrice|
+--------------------+---------+
|RUSTIC  SEVENTEEN...|    165.0|
|      DOTCOM POSTAGE|   569.77|
|      DOTCOM POSTAGE|   607.49|
+--------------------+---------+



### Numbers

In [8]:
from pyspark.sql.functions import expr,pow

fabricatedQuantity = pow(col("Quantity")*col("UnitPrice"),2)

df.select(
    expr("CustomerId"),
    fabricatedQuantity.alias("realQuantity"))\
    .show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|234.08999999999997|
|   17850.0|          413.7156|
+----------+------------------+
only showing top 2 rows



In [9]:
df.selectExpr(
    "CustomerId",
    "(POWER((Quantity * UnitPrice),2.0)+5)as realQuantity")\
    .show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [10]:
from pyspark.sql.functions import lit,round,bround

df.select(
    round(lit("2.5")),
    bround(lit("2.5")))\
    .show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [11]:
#Pearson Correlation coefficient
from pyspark.sql.functions import corr

df.stat.corr("Quantity","UnitPrice")
df.select(corr("Quantity","UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [12]:
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                NULL| 8.627413127413128| 4.151946589446603|15661.388719512195|          NULL|
| stddev|72.89447869788873|17407.897548583845|                NULL|26.371821677029203|15.638659854603892|1854.4496996893627|          NULL|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

In [13]:
from pyspark.sql.functions import count,mean,stddev_pop,min

In [14]:
colName="UnitPrice"
quantileProbs=[0.5]
relError=0.05
df.stat.approxQuantile("UnitPrice",quantileProbs,relError)

[2.51]

In [15]:
df.stat.crosstab("StockCode","Quantity").show()

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             21259|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  2|  0|  0|  0|  0| 

In [16]:
df.stat.freqItems(["StockCode","Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[22086, 21705, 72...|[200, 128, 23, 50...|
+--------------------+--------------------+



### Strings

In [17]:
from pyspark.sql.functions import initcap

df.select(initcap(col("Description"))).show()

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
|Set 7 Babushka Ne...|
|Glass Star Froste...|
|Hand Warmer Union...|
|Hand Warmer Red P...|
|Assorted Colour B...|
|Poppy's Playhouse...|
|Poppy's Playhouse...|
|Feltcraft Princes...|
|Ivory Knitted Mug...|
|Box Of 6 Assorted...|
|Box Of Vintage Ji...|
|Box Of Vintage Al...|
|Home Building Blo...|
|Love Building Blo...|
|Recipe Box With M...|
+--------------------+
only showing top 20 rows



In [18]:
from pyspark.sql.functions import lower,upper

df.select(
    col("Description"),
    lower(col("Description")),
    upper(col("Description")))\
    .show(2)

+--------------------+--------------------+--------------------+
|         Description|  lower(Description)|  upper(Description)|
+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|white hanging hea...|WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern| WHITE METAL LANTERN|
+--------------------+--------------------+--------------------+
only showing top 2 rows



In [19]:
from pyspark.sql.functions import lit,ltrim,rtrim,rpad,lpad,trim

df.select(
    ltrim(lit("   HELLO    ")).alias("ltrim"),
    rtrim(lit("   HELLO    ")).alias("rtrim"),
    trim(lit("   HELLO    ")).alias("trim"),
    lpad(lit("HELLO"),3," ").alias("lp"),
    rpad(lit("HELLO"),10," ").alias("rp"))\
    .show(2)

+---------+--------+-----+---+----------+
|    ltrim|   rtrim| trim| lp|        rp|
+---------+--------+-----+---+----------+
|HELLO    |   HELLO|HELLO|HEL|HELLO     |
|HELLO    |   HELLO|HELLO|HEL|HELLO     |
+---------+--------+-----+---+----------+
only showing top 2 rows



### Regular Expressions

In [20]:
#regular expression
from pyspark.sql.functions import regexp_replace

regex_string ="BLACK|WHITE|RED|GREEN|BLUE"

df.select(
    regexp_replace(col("Description"),regex_string,"COLOR")
    .alias("color_cleaned"),
    col("Description"))\
   .show(2)

+--------------------+--------------------+
|       color_cleaned|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [21]:
from pyspark.sql.functions import translate

df.select(
    translate(col("Description"),"LEET","1337"),
    col("Description"))\
   .show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [22]:
from pyspark.sql.functions import regexp_extract

extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"),extract_str,1)
    .alias("color_cleaned"),
    col("Description"))\
   .show(2)

+-------------+--------------------+
|color_cleaned|         Description|
+-------------+--------------------+
|        WHITE|WHITE HANGING HEA...|
|        WHITE| WHITE METAL LANTERN|
+-------------+--------------------+
only showing top 2 rows



In [23]:
from pyspark.sql.functions import instr

containsBlack = instr(col("Description"),"BLACK")>=1
containsWhite = instr(col("Description"),"WHITE")>=1

df.withColumn("hasSimpleColor",containsBlack|containsWhite)\
    .filter("hasSimpleColor")\
    .select("Description")\
    .show(3,False)


+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



In [24]:
from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    """This function creates a column declaring whether or
    not a given pySpark column contains the UPPERCASED
    color.
    Returns a new column type that can be used
    in a select statement.
    """
    return locate(color_string.upper(), column)\
        .cast("boolean")\
        .alias("is_" + color_string)  # ← Correct: use color_string, not c

# ✅ Fix 1: Close the list comprehension properly
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*"))  # Include all other columns too

# ✅ Working Query
df\
    .select(*selectedColumns)\
    .where(expr("is_white OR is_red"))\
    .select("Description")\
    .show(3, False)


+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



### Dates and Timestamps

In [25]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [25]:
#Date and timestamp

from pyspark.sql.functions import current_date,current_timestamp

dateDF= spark.range(10)\
    .withColumn("today",current_date())\
    .withColumn("now",current_timestamp())

dateDF.createOrReplaceTempView("dateTable")

dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [26]:
dateDF.show()

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2025-07-22|2025-07-22 14:49:...|
|  1|2025-07-22|2025-07-22 14:49:...|
|  2|2025-07-22|2025-07-22 14:49:...|
|  3|2025-07-22|2025-07-22 14:49:...|
|  4|2025-07-22|2025-07-22 14:49:...|
|  5|2025-07-22|2025-07-22 14:49:...|
|  6|2025-07-22|2025-07-22 14:49:...|
|  7|2025-07-22|2025-07-22 14:49:...|
|  8|2025-07-22|2025-07-22 14:49:...|
|  9|2025-07-22|2025-07-22 14:49:...|
+---+----------+--------------------+



In [27]:
from pyspark.sql.functions import date_add,date_sub

dateDF\
    .select(
        date_sub(col("today"),5),
        date_add(col("today"),5))\
    .show()

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
|        2025-07-17|        2025-07-27|
+------------------+------------------+



In [28]:
from pyspark.sql.functions import datediff,months_between,to_date

dateDF\
    .withColumn("week_ago",date_sub(col("today"),7))\
    .select(datediff(col("week_ago"),col("today")))\
    .show(1)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row



In [29]:
dateDF\
    .select(
        to_date(lit("2016-01-01")).alias("start"),
        to_date(lit("2017-05-22")).alias("end"))\
    .select(months_between(col("start"),col("end")))\
    .show(1)
        
        

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -16.67741935|
+--------------------------------+
only showing top 1 row



In [30]:
from pyspark.sql.functions import to_date,lit

spark.range(5).withColumn("date",lit("2017-01-01"))\
    .select(to_date(col("date")))\
    .show()

+-------------+
|to_date(date)|
+-------------+
|   2017-01-01|
|   2017-01-01|
|   2017-01-01|
|   2017-01-01|
|   2017-01-01|
+-------------+



In [31]:
dateDF.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show()

+-------------------+-------------------+
|to_date(2016-20-12)|to_date(2017-12-11)|
+-------------------+-------------------+
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
|               NULL|         2017-12-11|
+-------------------+-------------------+



In [32]:
#for fixing above date bug we can use unix_timestamp
from pyspark.sql.functions import unix_timestamp

dateFormat="yyyy-dd-MM"

cleanDateDF= spark.range(1)\
    .select(
        to_date(unix_timestamp(lit("2017-12-11"),dateFormat).cast("timestamp")
               ).alias("date"),
        to_date(unix_timestamp(lit("2017-20-12"),dateFormat).cast("timestamp")
               ).alias("date2")
    )

cleanDateDF.show()

                

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [33]:
cleanDateDF\
    .select(
        unix_timestamp(col("date"),dateFormat).cast("timestamp"))\
    .show()

+---------------------------------------------------+
|CAST(unix_timestamp(date, yyyy-dd-MM) AS TIMESTAMP)|
+---------------------------------------------------+
|                                2017-11-12 00:00:00|
+---------------------------------------------------+



### Working with Nulls in Data

In [34]:
#Working with Nulls in Data
#Drop
df.na.drop()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [35]:
df.na.drop("any")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [36]:
df.na.drop("all",subset=["StockCode","InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [37]:
#Fill
df.na.fill("All Null Values become this string")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [38]:
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [39]:
df.na.fill("all",subset=["StockCode","InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [40]:
fill_cols_vals= {
    "StockCode":5,
    "Description":"No Value"
}
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

## Replace

In [41]:
#replace
df.na.replace([""],["UNKNOWN"],"Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

## Complex types: structs, arrays and maps

## Structs
DataFrames within DataFrames

In [42]:
from pyspark.sql.functions import struct
complexDF=df\
    .select(struct("Description","InvoiceNo").alias("complex"))

complexDF.show(10,False)

+---------------------------------------------+
|complex                                      |
+---------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER, 536365} |
|{WHITE METAL LANTERN, 536365}                |
|{CREAM CUPID HEARTS COAT HANGER, 536365}     |
|{KNITTED UNION FLAG HOT WATER BOTTLE, 536365}|
|{RED WOOLLY HOTTIE WHITE HEART., 536365}     |
|{SET 7 BABUSHKA NESTING BOXES, 536365}       |
|{GLASS STAR FROSTED T-LIGHT HOLDER, 536365}  |
|{HAND WARMER UNION JACK, 536366}             |
|{HAND WARMER RED POLKA DOT, 536366}          |
|{ASSORTED COLOUR BIRD ORNAMENT, 536367}      |
+---------------------------------------------+
only showing top 10 rows



In [43]:
complexDF.select("complex.Description").show()

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|
|SET 7 BABUSHKA NE...|
|GLASS STAR FROSTE...|
|HAND WARMER UNION...|
|HAND WARMER RED P...|
|ASSORTED COLOUR B...|
|POPPY'S PLAYHOUSE...|
|POPPY'S PLAYHOUSE...|
|FELTCRAFT PRINCES...|
|IVORY KNITTED MUG...|
|BOX OF 6 ASSORTED...|
|BOX OF VINTAGE JI...|
|BOX OF VINTAGE AL...|
|HOME BUILDING BLO...|
|LOVE BUILDING BLO...|
|RECIPE BOX WITH M...|
+--------------------+
only showing top 20 rows



In [57]:
complexDF.select("complex.*").show()

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|WHITE HANGING HEA...|   536365|
| WHITE METAL LANTERN|   536365|
|CREAM CUPID HEART...|   536365|
|KNITTED UNION FLA...|   536365|
|RED WOOLLY HOTTIE...|   536365|
|SET 7 BABUSHKA NE...|   536365|
|GLASS STAR FROSTE...|   536365|
|HAND WARMER UNION...|   536366|
|HAND WARMER RED P...|   536366|
|ASSORTED COLOUR B...|   536367|
|POPPY'S PLAYHOUSE...|   536367|
|POPPY'S PLAYHOUSE...|   536367|
|FELTCRAFT PRINCES...|   536367|
|IVORY KNITTED MUG...|   536367|
|BOX OF 6 ASSORTED...|   536367|
|BOX OF VINTAGE JI...|   536367|
|BOX OF VINTAGE AL...|   536367|
|HOME BUILDING BLO...|   536367|
|LOVE BUILDING BLO...|   536367|
|RECIPE BOX WITH M...|   536367|
+--------------------+---------+
only showing top 20 rows



## Arrays

In [44]:
from pyspark.sql.functions import split

df.select(split(col("Description")," ")).show(2,False)

+----------------------------------------+
|split(Description,  , -1)               |
+----------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
|[WHITE, METAL, LANTERN]                 |
+----------------------------------------+
only showing top 2 rows



In [45]:
df.select(split(col("Description")," ").alias("array_col"))\
    .selectExpr("array_col[0]")\
    .show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



### Array Contains

In [46]:
from pyspark.sql.functions import array_contains

df.select(array_contains(split(col("Description")," "),"WHITE")).show()

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
|                                           false|
|                                           false|
|                                            true|
|                                           false|
|                                           false|
|                                           false|
|                                           false|
|                                           false|
|                                           false|
|                                           false|
|                                           false|
|                                           false|
|                                           false|
|                                           false|
|                              

### Explode

In [47]:
from pyspark.sql.functions import split,explode

df.withColumn("splitted",split(col("Description")," "))\
    .withColumn("exploded",explode(col("splitted")))\
    .select("Description","InvoiceNo","exploded")\
    .show(5,False)

+----------------------------------+---------+--------+
|Description                       |InvoiceNo|exploded|
+----------------------------------+---------+--------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |WHITE   |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HANGING |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HEART   |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |T-LIGHT |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HOLDER  |
+----------------------------------+---------+--------+
only showing top 5 rows



## Map

In [48]:
from pyspark.sql.functions import col, create_map

df_with_map = df.select(
    create_map(col("Description"), col("InvoiceNo")).alias("complex_map"),
    col("Description")
)

df_with_map.withColumn("value", col("complex_map")[col("Description")]).show(truncate=False)


+-----------------------------------------------+-----------------------------------+------+
|complex_map                                    |Description                        |value |
+-----------------------------------------------+-----------------------------------+------+
|{WHITE HANGING HEART T-LIGHT HOLDER -> 536365} |WHITE HANGING HEART T-LIGHT HOLDER |536365|
|{WHITE METAL LANTERN -> 536365}                |WHITE METAL LANTERN                |536365|
|{CREAM CUPID HEARTS COAT HANGER -> 536365}     |CREAM CUPID HEARTS COAT HANGER     |536365|
|{KNITTED UNION FLAG HOT WATER BOTTLE -> 536365}|KNITTED UNION FLAG HOT WATER BOTTLE|536365|
|{RED WOOLLY HOTTIE WHITE HEART. -> 536365}     |RED WOOLLY HOTTIE WHITE HEART.     |536365|
|{SET 7 BABUSHKA NESTING BOXES -> 536365}       |SET 7 BABUSHKA NESTING BOXES       |536365|
|{GLASS STAR FROSTED T-LIGHT HOLDER -> 536365}  |GLASS STAR FROSTED T-LIGHT HOLDER  |536365|
|{HAND WARMER UNION JACK -> 536366}             |HAND WARMER UNION JAC

## JSON

In [49]:
jsonDF=spark.range(1)\
    .selectExpr("""
    '{"myJSONKey" :{"myJSONValue" : [1,2,3]}}' as jsonString
    """)

In [50]:
jsonDF.show(2,False)

+----------------------------------------+
|jsonString                              |
+----------------------------------------+
|{"myJSONKey" :{"myJSONValue" : [1,2,3]}}|
+----------------------------------------+



In [51]:
from pyspark.sql.functions import col, get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue").alias("myJSONValue"),
    json_tuple(col("jsonString"), "myJSONKey").alias("myJSONKey")
).show(truncate=False)


+-----------+-----------------------+
|myJSONValue|myJSONKey              |
+-----------+-----------------------+
|[1,2,3]    |{"myJSONValue":[1,2,3]}|
+-----------+-----------------------+



In [52]:
#structType to JSON
from pyspark.sql.functions import to_json

df.selectExpr("(InvoiceNo,Description) as myStruct")\
    .select(to_json(col("myStruct"))).show(1,False)

+-------------------------------------------------------------------------+
|to_json(myStruct)                                                        |
+-------------------------------------------------------------------------+
|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"}|
+-------------------------------------------------------------------------+
only showing top 1 row



In [53]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *

parseSchema = StructType((
    StructField("InvoiceNo",StringType(),True),
    StructField("Description",StringType(),True)))

# Convert Struct to JSON and Parse Back
df.selectExpr("struct(InvoiceNo, Description) as myStruct") \
    .select(to_json(col("myStruct")).alias("newJSON")) \
    .select(from_json(col("newJSON"), parseSchema).alias("parsed"), col("newJSON")) \
    .show(truncate=False)

+---------------------------------------------+--------------------------------------------------------------------------+
|parsed                                       |newJSON                                                                   |
+---------------------------------------------+--------------------------------------------------------------------------+
|{536365, WHITE HANGING HEART T-LIGHT HOLDER} |{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"} |
|{536365, WHITE METAL LANTERN}                |{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}                |
|{536365, CREAM CUPID HEARTS COAT HANGER}     |{"InvoiceNo":"536365","Description":"CREAM CUPID HEARTS COAT HANGER"}     |
|{536365, KNITTED UNION FLAG HOT WATER BOTTLE}|{"InvoiceNo":"536365","Description":"KNITTED UNION FLAG HOT WATER BOTTLE"}|
|{536365, RED WOOLLY HOTTIE WHITE HEART.}     |{"InvoiceNo":"536365","Description":"RED WOOLLY HOTTIE WHITE HEART."}     |
|{536365, SET 7 

## User-Defined Functions

In [54]:
udfExampleDF = spark.range(5).toDF("num")

def power3(double_value):
    return double_value **3

power3(2.0)

8.0

In [55]:
from pyspark.sql.functions import udf

power3udf = udf(power3)

In [84]:
udfExampleDF.select(power3udf(col("num"))).show()

Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonFunction. Trace:
py4j.Py4JException: Constructor org.apache.spark.api.python.PythonFunction([class [B, class java.util.HashMap, class java.util.ArrayList, class java.lang.String, class java.lang.String, class java.util.ArrayList, class org.apache.spark.api.python.PythonAccumulatorV2]) does not exist
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:180)
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:197)
	at py4j.Gateway.invoke(Gateway.java:237)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)

