In [0]:
df = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("dbfs:/FileStore/by_day_retail_data/2010_12_01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
df.sample(fraction=0.005,seed=7).show(5,False)

+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|536408   |22114    |HOT WATER BOTTLE TEA AND SYMPATHY  |4       |2010-12-01 11:41:00|3.95     |14307.0   |United Kingdom|
|536488   |22909    |SET OF 20 VINTAGE CHRISTMAS NAPKINS|1       |2010-12-01 12:31:00|0.85     |17897.0   |United Kingdom|
|536537   |22800    |ANTIQUE TALL SWIRLGLASS TRINKET POT|8       |2010-12-01 13:51:00|3.75     |15922.0   |United Kingdom|
|536539   |37495    |FAIRY CAKE BIRTHDAY CANDLE SET     |4       |2010-12-01 14:03:00|3.75     |15165.0   |United Kingdom|
|536542   |22379    |RECYCLING BAG RETROSPOT            |20      |2010-12-01 14:11:00|2.1      |16456.0   |United Kingdom|
+---------+-----

In [0]:
#lit function converts type in another language to its correspnding Spark representation.
from pyspark.sql.functions import lit
df.select(lit(5)).show(2)

+---+
|  5|
+---+
|  5|
|  5|
+---+
only showing top 2 rows



In [0]:
df.show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import col
df.where(col("InvoiceNo") != 536365)\
.select("InvoiceNo", "Description")\
.show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [0]:
df.where("InvoiceNo != 536365").show(5, False)

+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                  |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|536366   |22633    |HAND WARMER UNION JACK       |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536366   |22632    |HAND WARMER RED POLKA DOT    |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536367   |84879    |ASSORTED COLOUR BIRD ORNAMENT|32      |2010-12-01 08:34:00|1.69     |13047.0   |United Kingdom|
|536367   |22745    |POPPY'S PLAYHOUSE BEDROOM    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
|536367   |22748    |POPPY'S PLAYHOUSE KITCHEN    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
+---------+---------+-----------------------------+--------+----

In [0]:
from pyspark.sql.functions import instr,col
priceFilter = col('UnitPrice')>600
desFilter = instr(df.Description, "POSTAGE" ) >= 1
df.where(df.StockCode.isin("DOT")).where(priceFilter | desFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [0]:
%sql
SELECT * FROM dfTable WHERE StockCode in ("DOT") AND 
(UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1)

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
536544,DOT,DOTCOM POSTAGE,1,2010-12-01T14:32:00.000+0000,569.77,,United Kingdom
536592,DOT,DOTCOM POSTAGE,1,2010-12-01T17:06:00.000+0000,607.49,,United Kingdom


In [0]:
dotFilter = col("StockCode") == 'DOT'
priceFilter = col("UnitPrice") > 600
desFilter = instr(col("Description"),"POSTAGE")>=1
df.withColumn('isExpensive', dotFilter & (priceFilter | desFilter)).filter("isExpensive").show(5)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|isExpensive|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|       true|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|       true|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+



In [0]:
%sql
select *, (StockCode = 'DOT' AND (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1)) as inExpensive
from dfTable
Where (StockCode = 'DOT' AND (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1))

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,inExpensive
536544,DOT,DOTCOM POSTAGE,1,2010-12-01T14:32:00.000+0000,569.77,,United Kingdom,True
536592,DOT,DOTCOM POSTAGE,1,2010-12-01T17:06:00.000+0000,607.49,,United Kingdom,True


In [0]:
from pyspark.sql.functions import expr
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
.where("isExpensive")\
.select("Description", "UnitPrice").show(5)

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



In [0]:
df.where(col("Description").eqNullSafe("hello")).show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [0]:
from pyspark.sql.functions import expr, pow
fabricatedQuantity = pow(col("Quantity")*col("UnitPrice"),2) + 5
df.select("CustomerID", fabricatedQuantity.alias("originalQuantity")).show(4)

+----------+------------------+
|CustomerID|  originalQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 4 rows



In [0]:
df.selectExpr("CustomerId", "(power(Quantity*UnitPrice,2)+5) as originalQuantity").show(2)

+----------+------------------+
|CustomerId|  originalQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [0]:
%sql
Select CustomerId, (power(Quantity*UnitPrice,2)+5) as originalQuantity
from dfTable LIMIT 2

CustomerId,originalQuantity
17850.0,239.09
17850.0,418.7156


In [0]:
from pyspark.sql.functions import lit, round,bround
df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import corr
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [0]:
%sql
 SELECT corr(Quantity,UnitPrice) FROM dfTable

"corr(Quantity, UnitPrice)"
-0.0411231443683555


In [0]:
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

In [0]:
colName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05
df.stat.approxQuantile("UnitPrice", quantileProbs, relError)

Out[20]: [2.51]

In [0]:
df.stat.crosstab("StockCode", "Quantity").show(1)

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             22578|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

In [0]:
df.stat.freqItems(["StockCode", "Quantity", "Description"]).show(1,False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from pyspark.sql.functions import monotonically_increasing_id
df.select("*",monotonically_increasing_id()).show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|monotonically_increasing_id()|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|                            0|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|                            1|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show(2,False)

+----------------------------------+
|initcap(Description)              |
+----------------------------------+
|White Hanging Heart T-light Holder|
|White Metal Lantern               |
+----------------------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import lower, upper
df.select(col("Description"),
lower(col("Description")),
upper(col("Description"))).show(2)

+--------------------+--------------------+--------------------+
|         Description|  lower(Description)|  upper(Description)|
+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|white hanging hea...|WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern| WHITE METAL LANTERN|
+--------------------+--------------------+--------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
ltrim(lit(" HELLO ")).alias("ltrim"),
rtrim(lit(" HELLO ")).alias("rtrim"),
trim(lit(" HELLO ")).alias("trim"),
lpad(lit("HELLO"), 3, " ").alias("lp"),
rpad(lit("HELLO"), 15, " ").alias("rp")).show(2)

+------+------+-----+---+---------------+
| ltrim| rtrim| trim| lp|             rp|
+------+------+-----+---+---------------+
|HELLO | HELLO|HELLO|HEL|HELLO          |
|HELLO | HELLO|HELLO|HEL|HELLO          |
+------+------+-----+---+---------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import regexp_replace
reg = 'BLACK|WHITE|RED|GREEN|BLUE|VIOLET'
df.select(regexp_replace(col('Description'),reg,"COLOR").alias("remove_color"), col('Description')).show(5)

+--------------------+--------------------+
|        remove_color|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
|CREAM CUPID HEART...|CREAM CUPID HEART...|
|KNITTED UNION FLA...|KNITTED UNION FLA...|
|COLOR WOOLLY HOTT...|RED WOOLLY HOTTIE...|
+--------------------+--------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\
.show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
col("Description")).show(2)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import instr
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
.where("hasSimpleColor")\
.select("Description","hasSimpleColor").show(3, False)

+----------------------------------+--------------+
|Description                       |hasSimpleColor|
+----------------------------------+--------------+
|WHITE HANGING HEART T-LIGHT HOLDER|true          |
|WHITE METAL LANTERN               |true          |
|RED WOOLLY HOTTIE WHITE HEART.    |true          |
+----------------------------------+--------------+
only showing top 3 rows



In [0]:
from pyspark.sql.functions import locate
simpleColors = ["black", "white", "red", "green", "blue"]
def color_locator(column, color):
    return locate(color.upper(), column).cast("boolean").alias("is_"+color)


In [0]:
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*"))

In [0]:
df.select(*selectedColumns).where(expr("is_white OR is_red")).show(3, False)

+--------+--------+------+--------+-------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|is_black|is_white|is_red|is_green|is_blue|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+--------+--------+------+--------+-------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|false   |true    |false |false   |false  |536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|false   |true    |false |false   |false  |536365   |71053    |WHITE METAL LANTERN               |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|false   |true    |true  |false   |false  |536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.    |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
+--------+------

In [0]:
from pyspark.sql.functions import locate
df.selectExpr("*","locate('WHITE', 'Description', 1)").show(3)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|locate(WHITE, Description, 1)|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|                            0|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|                            0|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|                            0|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------------------------+
only showing top 3 rows



In [0]:
from pyspark.sql.functions import current_date, current_timestamp
dateDf = spark.range(10).withColumn("today",current_date()).withColumn('current_time', current_timestamp())

In [0]:
dateDf.show(5,False)

+---+----------+----------------------+
|id |today     |current_time          |
+---+----------+----------------------+
|0  |2023-01-06|2023-01-06 06:23:10.34|
|1  |2023-01-06|2023-01-06 06:23:10.34|
|2  |2023-01-06|2023-01-06 06:23:10.34|
|3  |2023-01-06|2023-01-06 06:23:10.34|
|4  |2023-01-06|2023-01-06 06:23:10.34|
+---+----------+----------------------+
only showing top 5 rows



In [0]:
dateDf.createOrReplaceTempView("dateTable")

In [0]:
from pyspark.sql.functions import date_add, date_sub
dateDf.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-01-01|        2023-01-11|
+------------------+------------------+
only showing top 1 row



In [0]:
%sql
SELECT date_sub(today, 5), date_add(today, 5) FROM dateTable LIMIT 1

"date_sub(today, 5)","date_add(today, 5)"
2023-01-01,2023-01-11


In [0]:
from pyspark.sql.functions import datediff, months_between, to_date
dateDf.withColumn("week_ago", date_sub(col("today"), 7))\
.select(datediff(col("week_ago"), col("today"))).show(1)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row



In [0]:
dateDf.select(to_date(lit("1997-06-08")).alias('dob'),"today")\
.select(months_between(col("dob"),col("today"))).show(2)

+--------------------------------+
|months_between(dob, today, true)|
+--------------------------------+
|                   -306.93548387|
|                   -306.93548387|
+--------------------------------+
only showing top 2 rows



In [0]:
%sql
SELECT to_date('2016-01-01'), months_between('2016-01-01', '2017-01-01'),
datediff('2016-01-01', '2017-01-01')
FROM dateTable LIMIT 1

to_date(2016-01-01),"months_between(2016-01-01, 2017-01-01, true)","datediff(2016-01-01, 2017-01-01)"
2016-01-01,-12.0,-366


In [0]:
dateFormat = 'yyyy-dd-MM'
cleanDate = dateDf.select(to_date(lit("2016-20-12"),dateFormat).alias('date1'),to_date(lit("2017-12-11"),dateFormat).alias('date3'))
cleanDate.show(1)

+----------+----------+
|     date1|     date3|
+----------+----------+
|2016-12-20|2017-11-12|
+----------+----------+
only showing top 1 row



In [0]:
from pyspark.sql.functions import to_timestamp
cleanDate.select(to_timestamp(col("date1"), dateFormat)).show(1)

+-------------------------------+
|to_timestamp(date1, yyyy-dd-MM)|
+-------------------------------+
|            2016-12-20 00:00:00|
+-------------------------------+
only showing top 1 row



In [0]:
%sql
SELECT cast(to_date("2017-01-01", "yyyy-dd-MM") as timestamp)

"CAST(to_date(2017-01-01, yyyy-dd-MM) AS TIMESTAMP)"
2017-01-01T00:00:00.000+0000


In [0]:
cleanDate.filter(col("date3") < lit("2017-12-13")).show(1)

+----------+----------+
|     date1|     date3|
+----------+----------+
|2016-12-20|2017-11-12|
+----------+----------+
only showing top 1 row



In [0]:
from pyspark.sql.functions import coalesce
df.select("*",coalesce(col("Description"), col("CustomerId"))).where(col("CustomerID").isNull()).show(2,False)

+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+---------------------------------+
|InvoiceNo|StockCode|Description                    |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |coalesce(Description, CustomerId)|
+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+---------------------------------+
|536414   |22139    |null                           |56      |2010-12-01 11:52:00|0.0      |null      |United Kingdom|null                             |
|536544   |21773    |DECORATIVE ROSE BATHROOM BOTTLE|1       |2010-12-01 14:32:00|2.51     |null      |United Kingdom|DECORATIVE ROSE BATHROOM BOTTLE  |
+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+---------------------------------+
only showing top 2 rows



In [0]:
df.where("CustomerID is NULL").show(1)

+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|   536414|    22139|       null|      56|2010-12-01 11:52:00|      0.0|      null|United Kingdom|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
only showing top 1 row



In [0]:
cDf = spark.createDataFrame([(None, None), (1, 
                                            None), (None, 2),(4,3)], ("a", "b"))
cDf.select(coalesce(cDf["a"], cDf["b"])).show()

+--------------+
|coalesce(a, b)|
+--------------+
|          null|
|             1|
|             2|
|             4|
+--------------+



In [0]:
%sql
SELECT
ifnull(null, 'return_value'),
nullif('value', 'value'),
nvl(null, 'return_value'),
nvl2('not_null', 'return_value', "else_value")
FROM dfTable LIMIT 3

"ifnull(NULL, return_value)","nullif(value, value)","nvl(NULL, return_value)","nvl2(not_null, return_value, else_value)"
return_value,,return_value,return_value
return_value,,return_value,return_value
return_value,,return_value,return_value


In [0]:
df.na.drop('any').count()

Out[51]: 1968

In [0]:
df.na.drop("all").count()

Out[52]: 3108

In [0]:
df.na.drop("all", subset=["Description", "CustomerID"]).count()

Out[53]: 3098

In [0]:
#count null cols
from pyspark.sql.functions import count,when
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|        0|        0|         10|       0|          0|        0|      1140|      0|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [0]:
selectedCol = [count(col(c).isNull()).alias(c+'_is_null') for c in df.columns]
#null_col = [c+'_is_null' for c in df.columns]
#cond = " or ".join(null_col)
df.select(selectedCol).show()#.where(cond).count()

+-----------------+-----------------+-------------------+----------------+-------------------+-----------------+------------------+---------------+
|InvoiceNo_is_null|StockCode_is_null|Description_is_null|Quantity_is_null|InvoiceDate_is_null|UnitPrice_is_null|CustomerID_is_null|Country_is_null|
+-----------------+-----------------+-------------------+----------------+-------------------+-----------------+------------------+---------------+
|             3108|             3108|               3108|            3108|               3108|             3108|              3108|           3108|
+-----------------+-----------------+-------------------+----------------+-------------------+-----------------+------------------+---------------+



In [0]:
'''fill_cols_vals = {"StockCode": 5, "Description" : "No Value"}
df.na.fill(fill_cols_vals)

df.na.fill(5:Integer)
df.na.fill(5:Double)
df.na.fill('hello') #fills null value of columns containing String
df.na.fill("all", subset=["StockCode", "InvoiceNo"])

df.na.replace([""], ["UNKNOWN"], "Description")'''

Out[56]: 'fill_cols_vals = {"StockCode": 5, "Description" : "No Value"}\ndf.na.fill(fill_cols_vals)\n\ndf.na.fill(5:Integer)\ndf.na.fill(5:Double)\ndf.na.fill(\'hello\') #fills null value of columns containing String\ndf.na.fill("all", subset=["StockCode", "InvoiceNo"])\n\ndf.na.replace([""], ["UNKNOWN"], "Description")'

In [0]:
df.selectExpr("(Description, InvoiceNo) as complex", "*").show(2,False)
df.selectExpr("struct(Description, InvoiceNo) as complex", "*").show(2,False)

+--------------------------------------------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|complex                                     |InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+--------------------------------------------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|{WHITE HANGING HEART T-LIGHT HOLDER, 536365}|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|{WHITE METAL LANTERN, 536365}               |536365   |71053    |WHITE METAL LANTERN               |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
+--------------------------------------------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------

In [0]:
from pyspark.sql.functions import struct
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [0]:
complexDF.select("complex.Description").show(2,False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
+----------------------------------+
only showing top 2 rows



In [0]:
complexDF.select(col("complex").getField('InvoiceNo')).show(2,False)

+-----------------+
|complex.InvoiceNo|
+-----------------+
|536365           |
|536365           |
+-----------------+
only showing top 2 rows



In [0]:
complexDF.select("complex.*").show(2,False)

+----------------------------------+---------+
|Description                       |InvoiceNo|
+----------------------------------+---------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |
|WHITE METAL LANTERN               |536365   |
+----------------------------------+---------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import split
df.select("*",split(col("Description"), " ")).show(1,False)

+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+----------------------------------------+
|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |split(Description,  , -1)               |
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+----------------------------------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+----------------------------------------+
only showing top 1 row



In [0]:
df.select(split(col("Description"), " ").alias("array_col"))\
.selectExpr("array_col[0]").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



In [0]:
%sql
SELECT split(Description, ' ')[0] FROM dfTable

"split(Description, , -1)[0]"
WHITE
WHITE
CREAM
KNITTED
RED
SET
GLASS
HAND
HAND
ASSORTED


In [0]:
from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2)

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|                              5|
|                              3|
+-------------------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import explode
df.withColumn('splittedCol',split(col("Description")," "))\
.withColumn('explodedCol',explode('splittedCol')).show(10,False)

+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+----------------------------------------+-----------+
|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |splittedCol                             |explodedCol|
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+----------------------------------------+-----------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|WHITE      |
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HANGING    |
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|[WHI

In [0]:
from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.show(2,False)

+----------------------------------------------+
|complex_map                                   |
+----------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER -> 536365}|
|{WHITE METAL LANTERN -> 536365}               |
+----------------------------------------------+
only showing top 2 rows



In [0]:
%sql
SELECT map(Description, InvoiceNo) as complex_map FROM dfTable
WHERE Description IS NOT NULL LIMIT 2

complex_map
Map(WHITE HANGING HEART T-LIGHT HOLDER -> 536365)
Map(WHITE METAL LANTERN -> 536365)


In [0]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



In [0]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.selectExpr("*","explode(complex_map)").show(10,False)

+-----------------------------------------------+-----------------------------------+------+
|complex_map                                    |key                                |value |
+-----------------------------------------------+-----------------------------------+------+
|{WHITE HANGING HEART T-LIGHT HOLDER -> 536365} |WHITE HANGING HEART T-LIGHT HOLDER |536365|
|{WHITE METAL LANTERN -> 536365}                |WHITE METAL LANTERN                |536365|
|{CREAM CUPID HEARTS COAT HANGER -> 536365}     |CREAM CUPID HEARTS COAT HANGER     |536365|
|{KNITTED UNION FLAG HOT WATER BOTTLE -> 536365}|KNITTED UNION FLAG HOT WATER BOTTLE|536365|
|{RED WOOLLY HOTTIE WHITE HEART. -> 536365}     |RED WOOLLY HOTTIE WHITE HEART.     |536365|
|{SET 7 BABUSHKA NESTING BOXES -> 536365}       |SET 7 BABUSHKA NESTING BOXES       |536365|
|{GLASS STAR FROSTED T-LIGHT HOLDER -> 536365}  |GLASS STAR FROSTED T-LIGHT HOLDER  |536365|
|{HAND WARMER UNION JACK -> 536366}             |HAND WARMER UNION JAC

In [0]:
jsonDF = spark.range(1).selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")
jsonDF.show(1,False)

+-------------------------------------------+
|jsonString                                 |
+-------------------------------------------+
|{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}|
+-------------------------------------------+



In [0]:
from pyspark.sql.functions import get_json_object, json_tuple
jsonDF.select(
get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]") as "column",
json_tuple(col("jsonString"), "myJSONKey")).show(2)

[0;36m  File [0;32m"<command-2224162944859908>"[0;36m, line [0;32m3[0m
[0;31m    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]") as "column",[0m
[0m                                                                     ^[0m
[0;31mSyntaxError[0m[0;31m:[0m invalid syntax


In [0]:
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct"))).show(3,False)

+-------------------------------------------------------------------------+
|to_json(myStruct)                                                        |
+-------------------------------------------------------------------------+
|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"}|
|{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}               |
|{"InvoiceNo":"536365","Description":"CREAM CUPID HEARTS COAT HANGER"}    |
+-------------------------------------------------------------------------+
only showing top 3 rows



In [0]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
StructField("InvoiceNo",StringType(),True),
StructField("Description",StringType(),True)))
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2,False)

+--------------------------------------------+-------------------------------------------------------------------------+
|from_json(newJSON)                          |newJSON                                                                  |
+--------------------------------------------+-------------------------------------------------------------------------+
|{536365, WHITE HANGING HEART T-LIGHT HOLDER}|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"}|
|{536365, WHITE METAL LANTERN}               |{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}               |
+--------------------------------------------+-------------------------------------------------------------------------+
only showing top 2 rows



In [0]:
udfExampleDF = spark.range(5).toDF("num")
def power3(double_value):
    return double_value ** 3
power3(2.0)

Out[104]: 8.0

In [0]:
from pyspark.sql.functions import udf
power3udf = udf(power3)

In [0]:
udfExampleDF.select("*",power3udf(col("num"))).show(5)

+---+-----------+
|num|power3(num)|
+---+-----------+
|  0|          0|
|  1|          1|
|  2|          8|
|  3|         27|
|  4|         64|
+---+-----------+



In [0]:
spark.udf.register("power3", power3)

Out[111]: <function __main__.power3(double_value)>

In [0]:
udfExampleDF.selectExpr("*","power3(num)").show(5)

+---+-----------+
|num|power3(num)|
+---+-----------+
|  0|          0|
|  1|          1|
|  2|          8|
|  3|         27|
|  4|         64|
+---+-----------+



In [0]:
from pyspark.sql.types import IntegerType, DoubleType
spark.udf.register("power3py", power3, IntegerType())
# in Python
udfExampleDF.selectExpr("power3py(num)").show(2)

+-------------+
|power3py(num)|
+-------------+
|            0|
|            1|
+-------------+
only showing top 2 rows

