In [1]:
from pyspark.sql import SparkSession
spark = SparkSession\
  .builder\
  .appName('SGD_Chapter06')\
  .getOrCreate()

In [2]:
df = spark.read.format('csv')\
  .option('header', 'true')\
  .option('inferSchema', 'true')\
  .load('/home/jagadeesh/git/Spark-The-Definitive-Guide/data/retail-data/by-day/2010-12-01.csv')

df.printSchema()
df.createOrReplaceTempView('dfTable')

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [3]:
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [4]:
from pyspark.sql.functions import lit
df.select(lit(5), lit('five'), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

In [5]:
from pyspark.sql.functions import col
df.where(col('InvoiceNo') != 536365)\
  .select('InvoiceNo', 'Description')\
  .show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [6]:
df.where('InvoiceNo <> 536365').show(5, False)

+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                  |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------+--------+-------------------+---------+----------+--------------+
|536366   |22633    |HAND WARMER UNION JACK       |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536366   |22632    |HAND WARMER RED POLKA DOT    |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536367   |84879    |ASSORTED COLOUR BIRD ORNAMENT|32      |2010-12-01 08:34:00|1.69     |13047.0   |United Kingdom|
|536367   |22745    |POPPY'S PLAYHOUSE BEDROOM    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
|536367   |22748    |POPPY'S PLAYHOUSE KITCHEN    |6       |2010-12-01 08:34:00|2.1      |13047.0   |United Kingdom|
+---------+---------+-----------------------------+--------+----

In [7]:
from pyspark.sql.functions import instr
priceFilter = col('UnitPrice') > 600
descripFilter = instr(df.Description, 'POSTAGE') >= 1
df.where(df.StockCode.isin('DOT')).where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [8]:
DOTCodeFilter = col('StockCode') == 'DOT'
priceFilter = col('UnitPrice') > 600
descripFilter = instr(col('Description'), 'POSTAGE') >= 1
df.withColumn('isExpensive', DOTCodeFilter & (priceFilter | descripFilter))\
  .where('isExpensive')\
  .select('unitPrice', 'isExpensive').show(5)

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [9]:
from pyspark.sql.functions import expr
df.withColumn('isExpensive', expr('NOT UnitPrice <= 250'))\
  .filter('isExpensive')\
  .select('Description', 'UnitPrice').show(5)

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



In [10]:
df.withColumn('isExpensive', expr('NOT UnitPrice <= 250'))\
  .where('isExpensive')\
  .select('Description', 'UnitPrice').show(5)

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



In [11]:
from pyspark.sql.functions import pow, expr, col
fabricatedQuantity = pow(col('Quantity') * col('UnitPrice'), 2) + 5
df.select(expr('CustomerId'), fabricatedQuantity.alias('realQuantity')).show(5)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
|   17850.0|          418.7156|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 5 rows



In [12]:
df.selectExpr('CustomerId', 'POWER((Quantity * UnitPrice), 2) + 5 as realQuantity').show(5)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
|   17850.0|          418.7156|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 5 rows



In [13]:
from pyspark.sql.functions import lit, round, bround
df.select(round(lit('2.5')), bround(lit('2.5'))).show(5)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
|          3.0|           2.0|
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 5 rows



In [14]:
from pyspark.sql.types import FloatType

In [15]:
df.select(round(lit('3.2')), bround(lit('2.0'))).count()

3108

In [16]:
df.count()

3108

In [17]:
df.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [18]:
df.select(lit('3.2')).show(5)

+---+
|3.2|
+---+
|3.2|
|3.2|
|3.2|
|3.2|
|3.2|
+---+
only showing top 5 rows



In [19]:
from pyspark.sql.functions import corr
df.stat.corr('Quantity', 'UnitPrice')

-0.04112314436835551

In [20]:
df.select(corr('Quantity', 'UnitPrice')).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [21]:
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

In [22]:
colName = 'Quantity'
quantileProbs = [0.5]
relError = 0.05
df.stat.approxQuantile('UnitPrice', quantileProbs, relError)

[2.51]

In [23]:
from pyspark.sql.functions import count, mean, stddev_pop, min, max

In [24]:
df.stat.crosstab('StockCode', 'Quantity').show(5)

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             22578|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

In [25]:
df.stat.freqItems(['StockCode', 'Quantity']).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 32...|
+--------------------+--------------------+



In [26]:
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id().alias('Number'), 'InvoiceNo').show(3)

+------+---------+
|Number|InvoiceNo|
+------+---------+
|     0|   536365|
|     1|   536365|
|     2|   536365|
+------+---------+
only showing top 3 rows



In [27]:
import numpy as np
import h5py

In [28]:
data = np.random.rand(50)
with h5py.File('random.h5', 'w') as f:
    f.create_dataset('default', data=data)

In [29]:
sc = spark.sparkContext

In [30]:
with h5py.File('random.h5', 'r') as f:
    data = f['default']
    data_rdd = sc.parallelize((data))

In [31]:
from pyspark.sql import Row
df1 = sc.parallelize(Row(r = (Row([1.0, 2.0, 3.5])))).toDF()
df1.show()

+---------------+
|             _1|
+---------------+
|[1.0, 2.0, 3.5]|
+---------------+



In [None]:
import pyspark.sql.functions as F
def f(x):
    return F.lit(x)
data_spark = data_rdd.foreach(f).toDF()

In [32]:
from pyspark.sql.functions import initcap
df.select(initcap(col('Description'))).show(5)

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
+--------------------+
only showing top 5 rows



In [33]:
from pyspark.sql.functions import lower, upper
df.select(col('Description'),
    lower(col("Description")).alias('lower'),
    upper(lower(col('Description'))).alias('upper')).show(5)

+--------------------+--------------------+--------------------+
|         Description|               lower|               upper|
+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|white hanging hea...|WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern| WHITE METAL LANTERN|
|CREAM CUPID HEART...|cream cupid heart...|CREAM CUPID HEART...|
|KNITTED UNION FLA...|knitted union fla...|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|red woolly hottie...|RED WOOLLY HOTTIE...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [34]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("  Hello  ")).alias('ltrim'),
    rtrim(lit("  Hello  ")).alias('rtrim'),
    trim(lit("   Hello  ")).alias('trim'),
    lpad(lit("Hello"),6, " ").alias('lp'),
    rpad(lit("Hello"), 10, " ").alias('rp')).show(5)

+-------+-------+-----+------+----------+
|  ltrim|  rtrim| trim|    lp|        rp|
+-------+-------+-----+------+----------+
|Hello  |  Hello|Hello| Hello|Hello     |
|Hello  |  Hello|Hello| Hello|Hello     |
|Hello  |  Hello|Hello| Hello|Hello     |
|Hello  |  Hello|Hello| Hello|Hello     |
|Hello  |  Hello|Hello| Hello|Hello     |
+-------+-------+-----+------+----------+
only showing top 5 rows



In [35]:
#spark uses java regular expressions functioimport pyspark.sql.functions as ns regexp_extract, regexp_replace, translate
from pyspark.sql.functions import regexp_replace
regexp_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regexp_string, "COLOR").alias("color_clean"),
  col("Description")).show(5)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
|CREAM CUPID HEART...|CREAM CUPID HEART...|
|KNITTED UNION FLA...|KNITTED UNION FLA...|
|COLOR WOOLLY HOTT...|RED WOOLLY HOTTIE...|
+--------------------+--------------------+
only showing top 5 rows



In [36]:
# translate: replace given characters
from pyspark.sql.functions import translate
df.select(translate(col("Description"), 'LEET', "1437"), col("Description")).show(5)

+----------------------------------+--------------------+
|translate(Description, LEET, 1437)|         Description|
+----------------------------------+--------------------+
|              WHI74 HANGING H4A...|WHITE HANGING HEA...|
|               WHI74 M47A1 1AN74RN| WHITE METAL LANTERN|
|              CR4AM CUPID H4AR7...|CREAM CUPID HEART...|
|              KNI774D UNION F1A...|KNITTED UNION FLA...|
|              R4D WOO11Y HO77I4...|RED WOOLLY HOTTIE...|
+----------------------------------+--------------------+
only showing top 5 rows



In [37]:
from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
   regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
   col("Description")).show(5)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
|           |CREAM CUPID HEART...|
|           |KNITTED UNION FLA...|
|        RED|RED WOOLLY HOTTIE...|
+-----------+--------------------+
only showing top 5 rows



In [38]:
#instr: returns first occurance of index of the sub-string in a column. Index starts from one
from pyspark.sql.functions import instr
containsBlack = instr(col("Description"), "BLACK") >=1
containsWhite = instr(col("Description"), "WHITE") >=1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
  .where("hasSimpleColor")\
  .select("Description").show(20, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|WOOD 2 DRAWER CABINET WHITE FINISH|
|WOOD S/3 CABINET ANT WHITE FINISH |
|WOODEN PICTURE FRAME WHITE FINISH |
|WOODEN FRAME ANTIQUE WHITE        |
|RED WOOLLY HOTTIE WHITE HEART.    |
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|WOOD 2 DRAWER CABINET WHITE FINISH|
|WOOD S/3 CABINET ANT WHITE FINISH |
|WOODEN PICTURE FRAME WHITE FINISH |
|WOODEN FRAME ANTIQUE WHITE        |
|RED WOOLLY HOTTIE WHITE HEART.    |
|JUMBO  BAG BAROQUE BLACK WHITE    |
|WHITE SPOT RED CERAMIC DRAWER KNOB|
|FELT EGG COSY WHITE RABBIT        |
+----------------------------------+
only showing top 20 rows



In [84]:
from pyspark.sql.functions import expr, locate
simpleColors = ["black", "white", "red", "green", "blue"]
def color_locator(column, color_string):
    return locate(color_string.upper(), column)\
             .cast("boolean")\
             .alias("is_" + color_string)
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*")) # has to be column type
df.select(*selectedColumns).where(expr("is_white OR is_red"))\
  .select("Description").show(5, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
|HAND WARMER RED POLKA DOT         |
|RED COAT RACK PARIS FASHION       |
+----------------------------------+
only showing top 5 rows



In [86]:
simpleColors = ["black", "white", "red", "green", "blue"]
def color_locator(column, color_string):
    return locate(color_string.upper(), column)\
             .cast("boolean")\
             .alias("is_" + color_string)
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns

[Column<b'CAST(locate(BLACK, Description, 1) AS BOOLEAN) AS `is_black`'>,
 Column<b'CAST(locate(WHITE, Description, 1) AS BOOLEAN) AS `is_white`'>,
 Column<b'CAST(locate(RED, Description, 1) AS BOOLEAN) AS `is_red`'>,
 Column<b'CAST(locate(GREEN, Description, 1) AS BOOLEAN) AS `is_green`'>,
 Column<b'CAST(locate(BLUE, Description, 1) AS BOOLEAN) AS `is_blue`'>]

In [165]:
simpleColors = ["black", "white", "red", "green", "blue"]
def color_locator(column, color_string):
    return locate(color_string.upper(), column)\
             .cast("boolean")\
             .alias("is_" + color_string)
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
df.select(*selectedColumns).where(expr("is_white OR is_red"))\
  .select("Description").show(5, False)

AnalysisException: "cannot resolve '`Description`' given input columns: [is_red, is_black, is_blue, is_green, is_white];;\n'Project ['Description]\n+- Filter (is_white#1945 || is_red#1946)\n   +- Project [cast(locate(BLACK, Description#12, 1) as boolean) AS is_black#1944, cast(locate(WHITE, Description#12, 1) as boolean) AS is_white#1945, cast(locate(RED, Description#12, 1) as boolean) AS is_red#1946, cast(locate(GREEN, Description#12, 1) as boolean) AS is_green#1947, cast(locate(BLUE, Description#12, 1) as boolean) AS is_blue#1948]\n      +- Relation[InvoiceNo#10,StockCode#11,Description#12,Quantity#13,InvoiceDate#14,UnitPrice#15,CustomerID#16,Country#17] csv\n"

In [41]:
# Dates and Timestamps conform to java dates and timestamps standards
from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10)\
  .withColumn("today", current_date())\
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [43]:
dateDF.show(10, False)

+---+----------+-----------------------+
|id |today     |now                    |
+---+----------+-----------------------+
|0  |2019-10-08|2019-10-08 12:33:19.653|
|1  |2019-10-08|2019-10-08 12:33:19.653|
|2  |2019-10-08|2019-10-08 12:33:19.653|
|3  |2019-10-08|2019-10-08 12:33:19.653|
|4  |2019-10-08|2019-10-08 12:33:19.653|
|5  |2019-10-08|2019-10-08 12:33:19.653|
|6  |2019-10-08|2019-10-08 12:33:19.653|
|7  |2019-10-08|2019-10-08 12:33:19.653|
|8  |2019-10-08|2019-10-08 12:33:19.653|
|9  |2019-10-08|2019-10-08 12:33:19.653|
+---+----------+-----------------------+



In [45]:
from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show()

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
|        2019-10-03|        2019-10-13|
+------------------+------------------+



In [46]:
from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show()

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
+-------------------------+



In [48]:
dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
  .select(months_between(col("start"), col("end"))).show(1)

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -16.67741935|
+--------------------------------+
only showing top 1 row



In [51]:
# java format is year-month-date.If input is in the format, spark will not thorw  error, rather outputs 'null' values
spark.range(5).withColumn("date", lit("2017-01-01"))\
   .select(to_date(col("date"))).show()

+---------------+
|to_date(`date`)|
+---------------+
|           null|
|           null|
|           null|
|           null|
|           null|
+---------------+



In [55]:
from pyspark.sql.functions import to_date
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
  to_date(lit("2017-12-11"), dateFormat).alias("date"),
  to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.createOrReplaceTempView("dataTable2")

In [56]:
cleanDateDF.show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [57]:
from pyspark.sql.functions import to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

+----------------------------------+
|to_timestamp(`date`, 'yyyy-dd-MM')|
+----------------------------------+
|               2017-11-12 00:00:00|
+----------------------------------+



In [58]:
cleanDateDF.show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [59]:
cleanDateDF.filter(col("date2") > lit("2017-12-12")).show()  # yyyy-MM-dd

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [63]:
# coalesce funtion: first non-null value from a set of columns
from pyspark.sql.functions import coalesce
df.select(coalesce(col("CustomerId"), col("Description"))).show(10, False)

+---------------------------------+
|coalesce(CustomerId, Description)|
+---------------------------------+
|17850.0                          |
|17850.0                          |
|17850.0                          |
|17850.0                          |
|17850.0                          |
|17850.0                          |
|17850.0                          |
|17850.0                          |
|17850.0                          |
|13047.0                          |
+---------------------------------+
only showing top 10 rows



In [65]:
#  drop: which removes rows that contain nulls. The default is to drop any row in which any value is null
df.na.drop("any").count()

1968

In [66]:
# drop a row, if all values are null or Nan
df.na.drop("all").count() 

3108

In [67]:
df.count()

3108

In [70]:
df.na.drop("all", subset=["StockCode", "InvoiceNo"]).count()

3108

In [None]:
df.na.fill("All Null values becom this string") # For string operations
df.na.fill(5:Integer) # For numeric operations 

In [71]:
df.na.fill(5:Double)

SyntaxError: invalid syntax (<ipython-input-71-e0b3adf34fee>, line 1)

In [72]:
df.na.fill("all", subset=["StockCode", "InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [73]:
fill_cols_values = {"StockCode": 5, "Description": "No Value"}
df.na.fill(fill_cols_values)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [76]:
df1 = df.na.replace([""], ["UNKNOWN"], "Description")

In [77]:
df1

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [94]:
df1.select(locate("UNKNOWN", "Description").cast("boolean").alias("unknown")).where(expr("unknown==1")).show()

+-------+
|unknown|
+-------+
+-------+



In [108]:
# working with complex Types
from pyspark.sql.functions import struct
# complexDF = df.selectExpr(struct("Description", "InvoiceNo").alias("complex"))
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"), expr("*"))
complexDF.createOrReplaceTempView("complexDF")

In [109]:
complexDF.schema

StructType(List(StructField(complex,StructType(List(StructField(Description,StringType,true),StructField(InvoiceNo,StringType,true))),false),StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,IntegerType,true),StructField(InvoiceDate,TimestampType,true),StructField(UnitPrice,DoubleType,true),StructField(CustomerID,DoubleType,true),StructField(Country,StringType,true)))

In [110]:
complexDF.printSchema()

root
 |-- complex: struct (nullable = false)
 |    |-- Description: string (nullable = true)
 |    |-- InvoiceNo: string (nullable = true)
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [112]:
complexDF.select("complex.Description").show(4)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
+--------------------+
only showing top 4 rows



In [114]:
complexDF.select(col("complex").getField("Description")).show(4)

+--------------------+
| complex.Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
+--------------------+
only showing top 4 rows



In [115]:
from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(3)

+---------------------+
|split(Description,  )|
+---------------------+
| [WHITE, HANGING, ...|
| [WHITE, METAL, LA...|
| [CREAM, CUPID, HE...|
+---------------------+
only showing top 3 rows



In [116]:
df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(3)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
|       CREAM|
+------------+
only showing top 3 rows



In [117]:
from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(3)

+---------------------------+
|size(split(Description,  ))|
+---------------------------+
|                          5|
|                          3|
|                          5|
+---------------------------+
only showing top 3 rows



In [118]:
from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(3)

+--------------------------------------------+
|array_contains(split(Description,  ), WHITE)|
+--------------------------------------------+
|                                        true|
|                                        true|
|                                       false|
+--------------------------------------------+
only showing top 3 rows



In [121]:
df.select(col("Description")).where(expr("CustomerId == 17850")).count()

84

In [123]:
from pyspark.sql.functions import split, explode
df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded", "splitted").show(5)

+--------------------+---------+--------+--------------------+
|         Description|InvoiceNo|exploded|            splitted|
+--------------------+---------+--------+--------------------+
|WHITE HANGING HEA...|   536365|   WHITE|[WHITE, HANGING, ...|
|WHITE HANGING HEA...|   536365| HANGING|[WHITE, HANGING, ...|
|WHITE HANGING HEA...|   536365|   HEART|[WHITE, HANGING, ...|
|WHITE HANGING HEA...|   536365| T-LIGHT|[WHITE, HANGING, ...|
|WHITE HANGING HEA...|   536365|  HOLDER|[WHITE, HANGING, ...|
+--------------------+---------+--------+--------------------+
only showing top 5 rows



In [128]:
from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .show(3, False)

+----------------------------------------------+
|complex_map                                   |
+----------------------------------------------+
|[WHITE HANGING HEART T-LIGHT HOLDER -> 536365]|
|[WHITE METAL LANTERN -> 536365]               |
|[CREAM CUPID HEARTS COAT HANGER -> 536365]    |
+----------------------------------------------+
only showing top 3 rows



In [132]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(5)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
|                            null|
|                            null|
|                            null|
+--------------------------------+
only showing top 5 rows



In [133]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("create_map"))\
  .selectExpr("explode(create_map)").show(3)

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
|CREAM CUPID HEART...|536365|
+--------------------+------+
only showing top 3 rows



In [134]:
jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString """)

In [143]:
# use json_tuple, if json_object has only one level of nesting
from pyspark.sql.functions import get_json_object, json_tuple
jsonDF.select(
  get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[2]").alias("column"),
  json_tuple(col("jsonString"), "myJSONKey")).show(3, False)

+------+-----------------------+
|column|c0                     |
+------+-----------------------+
|3     |{"myJSONValue":[1,2,3]}|
+------+-----------------------+



In [142]:
jsonDF.show(2, False)

+-------------------------------------------+
|jsonString                                 |
+-------------------------------------------+
|{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}|
+-------------------------------------------+



In [146]:
from pyspark.sql.functions import to_json, from_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct"))).show(5, False)

+--------------------------------------------------------------------------+
|structstojson(myStruct)                                                   |
+--------------------------------------------------------------------------+
|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"} |
|{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}                |
|{"InvoiceNo":"536365","Description":"CREAM CUPID HEARTS COAT HANGER"}     |
|{"InvoiceNo":"536365","Description":"KNITTED UNION FLAG HOT WATER BOTTLE"}|
|{"InvoiceNo":"536365","Description":"RED WOOLLY HOTTIE WHITE HEART."}     |
+--------------------------------------------------------------------------+
only showing top 5 rows



In [160]:
from pyspark.sql.types import StringType, StructType, StructField
parseSchema = StructType((
  StructField("InvoiceNo", StringType(), True),
  StructField("Description", StringType(), True)))

df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")).alias("newJson"))\
  .select(from_json(col("newJson"),parseSchema), col("newJson")).show(2, False)

+--------------------------------------------+-------------------------------------------------------------------------+
|jsontostructs(newJson)                      |newJson                                                                  |
+--------------------------------------------+-------------------------------------------------------------------------+
|[536365, WHITE HANGING HEART T-LIGHT HOLDER]|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"}|
|[536365, WHITE METAL LANTERN]               |{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}               |
+--------------------------------------------+-------------------------------------------------------------------------+
only showing top 2 rows



In [182]:
from pyspark.sql.functions import udf
udfExampleDF = spark.range(5).toDF("num")
def power3(double_value):
    return float((double_value ** 3))


In [183]:
udfExampleDF.select(power3(col("num"))).show()

TypeError: float() argument must be a string or a number, not 'Column'

In [185]:
power3udf = udf(power3)
udfExampleDF.select(power3udf(col("num"))).show()

+-----------+
|power3(num)|
+-----------+
|        0.0|
|        1.0|
|        8.0|
|       27.0|
|       64.0|
+-----------+



In [187]:
# if specified type does not align with the actual type returned by the function, Spark will not throw an error but will just return
# null to designate a failure.
from pyspark.sql.types import DoubleType, IntegerType
spark.udf.register("power3py", power3, DoubleType())
udfExampleDF.selectExpr("power3py(num)").show()

+-------------+
|power3py(num)|
+-------------+
|          0.0|
|          1.0|
|          8.0|
|         27.0|
|         64.0|
+-------------+

