In [79]:
from pyspark.sql import SparkSession

In [80]:
# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("Spark_Interview_Questions")
    .getOrCreate()
)

### 1. Read a Parquet file with duplicate records, deduplicate it, and write the result back to HDFS.

In [27]:
# Load the orders dataset (which contains duplicate rows) from its Parquet location.
orders_df = (
    spark.read
    .format("parquet")
    .load("/user/itv012857/data/orders_parquet")
)

In [28]:
# Print the schema (column names, types, nullability) of the loaded orders data.
orders_df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [29]:
# Preview the first rows of the orders data.
orders_df.show()

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
|       7130|2013-07-25 00:00:...|       6|       COMPLETE|
|       4530|2013-07-25 00:00:...|       7|       COMPLETE|
|       2911|2013-07-25 00:00:...|       8|     PROCESSING|
|       5657|2013-07-25 00:00:...|       9|PENDING_PAYMENT|
|       5648|2013-07-25 00:00:...|      10|PENDING_PAYMENT|
|        918|2013-07-25 00:00:...|      11| PAYMENT_REVIEW|
|       1837|2013-07-25 00:00:...|      12|         CLOSED|
|       9149|2013-07-25 00:00:...|      13|PENDING_PAYMENT|
|       9842|2013-07-25 00:00:...|      

In [30]:
# Row count before de-duplication.
orders_df.count()

178715

In [31]:
# Drop rows that are exact duplicates (no column subset is given, so whole rows are compared).
# orders_df.distinct() is an equivalent alternative here.

distinct_orders_df = orders_df.dropDuplicates()

In [32]:
# Row count after de-duplication; compare with the count of orders_df taken earlier.
distinct_orders_df.count()

68883

In [33]:
# Persist the de-duplicated orders as Parquet, overwriting any existing data at the target path.
(
    distinct_orders_df.write
    .mode("overwrite")
    .format("parquet")
    .save("/user/itv012857/data/distinct_orders_parquet")
)

### 2. Pivot the input data, granularity - col1 and col2
Schema : col1 string, col2 string, col3 int

Output required:
            
    col1,col2,col3
     a,aa,[1,2]
     b,bb,[5,4,3] 

In [4]:
# Sample rows as (col1, col2, col3) tuples; rows sharing (col1, col2) are grouped together later.
input_data = [
    ("a", "aa", 1),
    ("a", "aa", 2),
    ("b", "bb", 5),
    ("b", "bb", 4),
    ("b", "bb", 3),
]

In [5]:
# DDL-style schema string for the sample rows, one "name type" entry per column.
input_schema = ", ".join(["col1 string", "col2 string", "col3 int"])

In [7]:
# Build a DataFrame from the sample rows using the explicit schema string.
input_df = spark.createDataFrame(
    data=input_data,
    schema=input_schema,
)

In [8]:
# Display the sample input rows.
input_df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   a|  aa|   1|
|   a|  aa|   2|
|   b|  bb|   5|
|   b|  bb|   4|
|   b|  bb|   3|
+----+----+----+



In [17]:
from pyspark.sql.functions import collect_list, concat_ws

In [14]:
# Collapse each (col1, col2) group into one row whose col3 is an array of the group's values.
# NOTE(review): collect_list gives no ordering guarantee after a shuffle — confirm whether
# the element order in the required output matters.
col3_values = collect_list("col3").alias("col3")
output_df = input_df.groupBy("col1", "col2").agg(col3_values)

In [15]:
# Display the grouped result with col3 collected into an array.
output_df.show()

+----+----+---------+
|col1|col2|     col3|
+----+----+---------+
|   a|  aa|   [1, 2]|
|   b|  bb|[5, 4, 3]|
+----+----+---------+



In [18]:
# Same grouping as before, but col3 is joined into a single "|"-delimited string instead of an array.
pipe_joined_col3 = concat_ws("|", collect_list("col3")).alias("col3")
output_df_pipe = input_df.groupBy("col1", "col2").agg(pipe_joined_col3)

In [20]:
# Display the grouped result with col3 as a pipe-delimited string.
output_df_pipe.show()

+----+----+-----+
|col1|col2| col3|
+----+----+-----+
|   a|  aa|  1|2|
|   b|  bb|5|4|3|
+----+----+-----+



### 3. Explode or Unpivot employee id into different rows, one row per emp id

Input JSON: <br>
{ "dept_id":101,"e_id":[10101,10102,10103]} <br>
{ "dept_id":102,"e_id":[10201,10202]} <br>

In [12]:
from pyspark.sql.functions import explode

In [10]:
# Load the department/employee JSON input; note this rebinds input_df to a new dataset.
input_df = (
    spark.read
    .format("json")
    .load("/user/itv012857/data/input/input.json")
)

In [11]:
# Display the JSON input without truncating the e_id arrays.
input_df.show(truncate = False)

+-------+---------------------+
|dept_id|e_id                 |
+-------+---------------------+
|101    |[10101, 10102, 10103]|
|102    |[10201, 10202]       |
+-------+---------------------+



In [14]:
# Emit one row per element of the e_id array, keeping dept_id alongside each element.
output_df = input_df.select(
    "dept_id",
    explode("e_id").alias("eid"),
)

In [15]:
# Display the exploded result: one row per (dept_id, eid) pair.
output_df.show()

+-------+-----+
|dept_id|  eid|
+-------+-----+
|    101|10101|
|    101|10102|
|    101|10103|
|    102|10201|
|    102|10202|
+-------+-----+



In [18]:
# Register the DataFrame as a temporary view so it can be queried with Spark SQL.
# NOTE(review): the view is named "input_date"; "input_data" may have been intended.
# The SQL query in this notebook uses the same name, so it works as written.
input_df.createOrReplaceTempView("input_date")

In [20]:
# Spark SQL version of the explode: one output row per element of e_id, read from the temp view.
spark.sql(""" SELECT
                dept_id,
                explode(e_id) AS eid
                FROM input_date""")

dept_id,eid
101,10101
101,10102
101,10103
102,10201
102,10202


### 4. Average item sales per day, and the maximum of those daily averages per item

In [107]:
from pyspark.sql.functions import to_date, to_timestamp, expr, round, avg, max

In [95]:
# DDL-style schema for the order CSV, one "name type" entry per column.
# invoice_date is read as a string and parsed into date/timestamp columns later.
orders_schema = ", ".join([
    "invoice_no long",
    "stock_code string",
    "description string",
    "quantity int",
    "invoice_date string",
    "unit_price double",
    "customer_id long",
    "country string",
])

In [96]:
# Read the order CSV with the explicit schema, treating the first line as a header.
# Note this rebinds orders_df to a different dataset than the Parquet orders loaded earlier.
orders_df = (
    spark.read
    .schema(orders_schema)
    .option("header", True)
    .csv("/public/trendytech/datasets/order_data.csv")
)

In [97]:
# Print the schema applied to the order CSV.
orders_df.printSchema()

root
 |-- invoice_no: long (nullable = true)
 |-- stock_code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- invoice_date: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- country: string (nullable = true)



In [98]:
# Preview the raw order rows without truncating long values.
orders_df.show(truncate = False)

+----------+----------+-----------------------------------+--------+---------------+----------+-----------+--------------+
|invoice_no|stock_code|description                        |quantity|invoice_date   |unit_price|customer_id|country       |
+----------+----------+-----------------------------------+--------+---------------+----------+-----------+--------------+
|536378    |null      |PACK OF 60 DINOSAUR CAKE CASES     |24      |01-12-2010 9.37|0.55      |14688      |United Kingdom|
|536378    |null      |PACK OF 60 PINK PAISLEY CAKE CASES |24      |01-12-2010 9.37|0.55      |14688      |United Kingdom|
|536378    |84991     |60 TEATIME FAIRY CAKE CASES        |24      |01-12-2010 9.37|0.55      |14688      |United Kingdom|
|536378    |84519A    |TOMATO CHARLIE+LOLA COASTER SET    |6       |01-12-2010 9.37|2.95      |14688      |United Kingdom|
|536378    |85183B    |CHARLIE & LOLA WASTEPAPER BIN FLORA|48      |01-12-2010 9.37|1.25      |14688      |United Kingdom|
|536378    |8507

In [99]:
# Derive the columns needed for the daily aggregation:
#   invoice_timestamp - the original invoice_date string parsed as a timestamp
#   invoice_date      - replaced by the date parsed from the same string
#   amount            - unit_price * quantity, rounded to 2 decimals
# invoice_timestamp is added first because it reads the string form of invoice_date,
# which the second step overwrites.
invoice_format = "dd-MM-yyyy H.mm"
orders_df1 = (
    orders_df
    .withColumn("invoice_timestamp", to_timestamp("invoice_date", invoice_format))
    .withColumn("invoice_date", to_date("invoice_date", invoice_format))
    .withColumn("amount", round(expr("unit_price * quantity"), 2))
)

In [100]:
# Preview the orders with the parsed date/timestamp and computed amount columns.
orders_df1.show()

+----------+----------+--------------------+--------+------------+----------+-----------+--------------+-------------------+------+
|invoice_no|stock_code|         description|quantity|invoice_date|unit_price|customer_id|       country|  invoice_timestamp|amount|
+----------+----------+--------------------+--------+------------+----------+-----------+--------------+-------------------+------+
|    536378|      null|PACK OF 60 DINOSA...|      24|  2010-12-01|      0.55|      14688|United Kingdom|2010-12-01 09:37:00|  13.2|
|    536378|      null|PACK OF 60 PINK P...|      24|  2010-12-01|      0.55|      14688|United Kingdom|2010-12-01 09:37:00|  13.2|
|    536378|     84991|60 TEATIME FAIRY ...|      24|  2010-12-01|      0.55|      14688|United Kingdom|2010-12-01 09:37:00|  13.2|
|    536378|    84519A|TOMATO CHARLIE+LO...|       6|  2010-12-01|      2.95|      14688|United Kingdom|2010-12-01 09:37:00|  17.7|
|    536378|    85183B|CHARLIE & LOLA WA...|      48|  2010-12-01|      1.25

In [101]:
# Average amount for each item (description) on each invoice date, rounded to 2 decimals.
avg_amount_col = round(avg("amount"), 2).alias("avg_amount")
orders_df_avg_amount = orders_df1.groupBy("description", "invoice_date").agg(avg_amount_col)

In [102]:
# Show the per-item daily averages, most recent invoice date first, without truncation.
(
    orders_df_avg_amount
    .orderBy("invoice_date", ascending=False)
    .show(truncate=False)
)

+---------------------------------+------------+----------+
|description                      |invoice_date|avg_amount|
+---------------------------------+------------+----------+
|CAKE STAND WHITE TWO TIER LACE   |2011-12-09  |17.7      |
|SET OF 3 BLACK FLYING DUCKS      |2011-12-09  |8.18      |
|RED DINER WALL CLOCK             |2011-12-09  |16.63     |
|GREEN ENAMEL FLOWER RING         |2011-12-09  |2.9       |
|DIAMANTE HAIR GRIP PACK/2 RUBY   |2011-12-09  |1.65      |
|TOXIC AREA  DOOR HANGER          |2011-12-09  |6.32      |
|SPACEBOY CHILDRENS CUP           |2011-12-09  |2.46      |
|12 MESSAGE CARDS WITH ENVELOPES  |2011-12-09  |1.65      |
|PAPER CHAIN KIT RETROSPOT        |2011-12-09  |11.69     |
|VINTAGE BEAD PINK PURSE          |2011-12-09  |3.26      |
|ZINC HEART FLOWER T-LIGHT HOLDER |2011-12-09  |2.46      |
|GLITTER CHRISTMAS TREE WITH BELLS|2011-12-09  |19.56     |
|SLATE TILE NATURAL HANGING       |2011-12-09  |9.48      |
|STAR T-LIGHT HOLDER WILLIE WINKIE|2011-

In [103]:
# Spot check: pull the underlying rows for one item on one day to verify its average by hand.
spot_check_rows = orders_df1.filter(
    "description = 'SET OF 4 ROSE BOTANICAL CANDLES' AND invoice_date = '2011-12-09'"
)
spot_check_rows.show()

+----------+----------+--------------------+--------+------------+----------+-----------+--------------+-------------------+------+
|invoice_no|stock_code|         description|quantity|invoice_date|unit_price|customer_id|       country|  invoice_timestamp|amount|
+----------+----------+--------------------+--------+------------+----------+-----------+--------------+-------------------+------+
|    581492|     21620|SET OF 4 ROSE BOT...|       5|  2011-12-09|      3.29|       null|United Kingdom|2011-12-09 10:03:00| 16.45|
|    581516|     21620|SET OF 4 ROSE BOT...|      12|  2011-12-09|      1.25|      14422|United Kingdom|2011-12-09 11:26:00|  15.0|
+----------+----------+--------------------+--------+------------+----------+-----------+--------------+-------------------+------+



In [104]:
# Hand-computed mean of the two amounts for the spot-checked item/day,
# to compare against the aggregated average.
amounts = (16.45, 15)
print(sum(amounts) / len(amounts))

15.725


In [106]:
# Print the schema of the per-item, per-day average DataFrame.
orders_df_avg_amount.printSchema()

root
 |-- description: string (nullable = true)
 |-- invoice_date: date (nullable = true)
 |-- avg_amount: double (nullable = true)



In [108]:
# For every item (description), keep the highest of its per-day average amounts.
df_max_avg_per_item = (
    orders_df_avg_amount
    .groupBy("description")
    .agg(max("avg_amount").alias("max_avg_per_item"))
)

In [111]:
# Display the maximum daily average per item without truncating descriptions.
df_max_avg_per_item.show(truncate = False)

+-----------------------------------+----------------+
|description                        |max_avg_per_item|
+-----------------------------------+----------------+
|MAGNETS PACK OF 4 VINTAGE LABELS   |27.75           |
|SILVER FABRIC MIRROR               |29.75           |
|DECORATION SITTING BUNNY           |18.24           |
|VINTAGE LEAF CHOPPING BOARD        |49.72           |
|SET/3 RED GINGHAM ROSE STORAGE BOX |1220.4          |
|WOVEN BERRIES CUSHION COVER        |33.04           |
|10 COLOUR SPACEBOY PEN             |207.36          |
|SET/10 BLUE POLKADOT PARTY CANDLES |305.28          |
|WHITE/PINK MINI CRYSTALS NECKLACE  |19.8            |
|WHITE FRANGIPANI NECKLACE          |19.84           |
|PINK  HONEYCOMB PAPER FAN          |57.82           |
|PAPERWEIGHT KINGS CHOICE           |30.6            |
|SNACK TRAY I LOVE LONDON           |34.2            |
|WHITE CHRYSANTHEMUMS ART FLOWER    |37.8            |
|ANTIQUE MID BLUE FLOWER EARRINGS   |8.5             |
|POTTING S

In [114]:
# Spot check: list one item's daily averages from highest to lowest, so the top row
# can be compared with that item's max_avg_per_item.
(
    orders_df_avg_amount
    .filter("description = 'SILVER FABRIC MIRROR'")
    .orderBy("avg_amount", ascending=False)
    .show()
)

+--------------------+------------+----------+
|         description|invoice_date|avg_amount|
+--------------------+------------+----------+
|SILVER FABRIC MIRROR|  2010-12-23|     29.75|
|SILVER FABRIC MIRROR|  2011-11-16|      23.4|
|SILVER FABRIC MIRROR|  2011-11-30|      23.4|
|SILVER FABRIC MIRROR|  2011-11-01|      23.4|
|SILVER FABRIC MIRROR|  2011-11-20|     12.67|
|SILVER FABRIC MIRROR|  2011-07-22|      7.51|
|SILVER FABRIC MIRROR|  2011-09-28|      5.13|
|SILVER FABRIC MIRROR|  2011-07-08|      5.07|
|SILVER FABRIC MIRROR|  2010-12-14|       5.0|
|SILVER FABRIC MIRROR|  2011-01-18|      4.98|
|SILVER FABRIC MIRROR|  2011-11-28|       3.9|
|SILVER FABRIC MIRROR|  2011-03-09|      3.75|
|SILVER FABRIC MIRROR|  2011-01-06|       3.4|
|SILVER FABRIC MIRROR|  2011-10-27|      3.12|
|SILVER FABRIC MIRROR|  2011-11-21|      2.54|
|SILVER FABRIC MIRROR|  2010-12-10|       2.5|
|SILVER FABRIC MIRROR|  2011-11-23|      2.46|
|SILVER FABRIC MIRROR|  2011-11-22|      2.46|
|SILVER FABRI

In [None]:
# Stop the Spark session; any cell run after this needs a new session created first.
spark.stop()