<H1 align = 'Center'> Spark Higher Level APIs </H1>
<H3> => Assignment 1 : Spark Tables </H3>
<BR>

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder. \
config("spark.ui.port","0"). \
config("spark.sql.warehouse.dir","/user/itv012857/warehouse"). \
enableHiveSupport(). \
master("yarn"). \
getOrCreate()

In [None]:
spark.sql("create database itv012857_groceries")

In [30]:
spark.sql("use itv012857_groceries")

### 1. Create a Managed Spark table using the file - /public/trendytech/groceries.csv

In [31]:
spark.sql("DROP TABLE groceries;");

In [32]:
spark.sql("CREATE TABLE groceries(order_id string, \
            location string, item string, order_date date, \
            quantity int) \
          ")

In [33]:
spark.sql("describe extended groceries").show(truncate = False)

+----------------------------+---------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                              |comment|
+----------------------------+---------------------------------------------------------------------------------------+-------+
|order_id                    |string                                                                                 |null   |
|location                    |string                                                                                 |null   |
|item                        |string                                                                                 |null   |
|order_date                  |date                                                                                   |null   |
|quantity                    |int                                                                              

In [34]:
groceries_df = spark.read.csv("/public/trendytech/groceries.csv", header = "true")

In [35]:
groceries_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- item: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- quantity: string (nullable = true)



In [36]:
groceries_df.show()

+--------+---------+--------+----------+--------+
|order_id| location|    item|order_date|quantity|
+--------+---------+--------+----------+--------+
|      o1|  Seattle| Bananas|01/01/2017|       7|
|      o2|     Kent|  Apples|02/01/2017|      20|
|      o3| Bellevue| Flowers|02/01/2017|      10|
|      o4|  Redmond|    Meat|03/01/2017|      40|
|      o5|  Seattle|Potatoes|04/01/2017|       9|
|      o6| Bellevue|   Bread|04/01/2017|       5|
|      o7|  Redmond|   Bread|05/01/2017|       5|
|      o8| Issaquah|   Onion|05/01/2017|       4|
|      o9|  Redmond|  Cheese|05/01/2017|      15|
|     o10| Issaquah|   Onion|06/01/2017|       4|
|     o11|   Renton|   Bread|05/01/2017|       5|
|     o12| Issaquah|   Onion|07/01/2017|       4|
|     o13|Sammamish|   Bread|07/01/2017|       5|
|     o14| Issaquah|  Tomato|07/01/2017|       6|
|     o15| Issaquah|    Meat|08/01/2017|       3|
|     o16| Issaquah|    Meat|09/01/2017|       5|
|     o17| Issaquah|    Meat|10/01/2017|       6|


In [37]:
groceries_df.createOrReplaceTempView("groceries_view")

In [38]:
spark.sql("describe table groceries_view")

col_name,data_type,comment
order_id,string,
location,string,
item,string,
order_date,string,
quantity,string,


In [39]:
spark.sql("INSERT INTO groceries \
           SELECT order_id, location, item, to_date(order_date,'dd/mm/yyyy'), CAST(quantity as int) \
              FROM groceries_view")

In [40]:
spark.sql("SELECT * FROM groceries")

order_id,location,item,order_date,quantity
o1,Seattle,Bananas,2017-01-01,7
o2,Kent,Apples,2017-01-02,20
o3,Bellevue,Flowers,2017-01-02,10
o4,Redmond,Meat,2017-01-03,40
o5,Seattle,Potatoes,2017-01-04,9
o6,Bellevue,Bread,2017-01-04,5
o7,Redmond,Bread,2017-01-05,5
o8,Issaquah,Onion,2017-01-05,4
o9,Redmond,Cheese,2017-01-05,15
o10,Issaquah,Onion,2017-01-06,4


### 2. Create an External Spark table using the same file - /public/trendytech/groceries.csv

In [41]:
spark.sql("DROP TABLE ext_groceries")

In [42]:
spark.sql("CREATE TABLE ext_groceries(order_id string, \
            location string, item string, order_date string, \
            quantity int) \
            USING CSV \
            LOCATION '/public/trendytech/groceries.csv' \
            OPTIONS ( header 'true') \
          ")

In [43]:
spark.sql("SELECT * FROM ext_groceries")

order_id,location,item,order_date,quantity
o1,Seattle,Bananas,01/01/2017,7
o2,Kent,Apples,02/01/2017,20
o3,Bellevue,Flowers,02/01/2017,10
o4,Redmond,Meat,03/01/2017,40
o5,Seattle,Potatoes,04/01/2017,9
o6,Bellevue,Bread,04/01/2017,5
o7,Redmond,Bread,05/01/2017,5
o8,Issaquah,Onion,05/01/2017,4
o9,Redmond,Cheese,05/01/2017,15
o10,Issaquah,Onion,06/01/2017,4


In [44]:
spark.sql("describe extended ext_groceries").show(truncate = False)

+----------------------------+-------------------------------------------------------------+-------+
|col_name                    |data_type                                                    |comment|
+----------------------------+-------------------------------------------------------------+-------+
|order_id                    |string                                                       |null   |
|location                    |string                                                       |null   |
|item                        |string                                                       |null   |
|order_date                  |string                                                       |null   |
|quantity                    |int                                                          |null   |
|                            |                                                             |       |
|# Detailed Table Information|                                                             

<H3> 3. Create a Managed & External Spark table using the JSON file - <H4>/public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json <BR>

In [5]:
orders_df = spark.read.json("/public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json")

In [6]:
orders_df.show(truncate = False)

+-----------+---------------------+--------+---------------+
|customer_id|order_date           |order_id|order_status   |
+-----------+---------------------+--------+---------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED         |
|256        |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111      |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827       |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318      |2013-07-25 00:00:00.0|5       |COMPLETE       |
|7130       |2013-07-25 00:00:00.0|6       |COMPLETE       |
|4530       |2013-07-25 00:00:00.0|7       |COMPLETE       |
|2911       |2013-07-25 00:00:00.0|8       |PROCESSING     |
|5657       |2013-07-25 00:00:00.0|9       |PENDING_PAYMENT|
|5648       |2013-07-25 00:00:00.0|10      |PENDING_PAYMENT|
|918        |2013-07-25 00:00:00.0|11      |PAYMENT_REVIEW |
|1837       |2013-07-25 00:00:00.0|12      |CLOSED         |
|9149       |2013-07-25 00:00:00.0|13      |PENDING_PAYMENT|
|9842       |2013-07-25 

In [7]:
orders_df.createOrReplaceTempView("orders_view")

In [8]:
spark.sql("select * from orders_view")

customer_id,order_date,order_id,order_status
11599,2013-07-25 00:00:...,1,CLOSED
256,2013-07-25 00:00:...,2,PENDING_PAYMENT
12111,2013-07-25 00:00:...,3,COMPLETE
8827,2013-07-25 00:00:...,4,CLOSED
11318,2013-07-25 00:00:...,5,COMPLETE
7130,2013-07-25 00:00:...,6,COMPLETE
4530,2013-07-25 00:00:...,7,COMPLETE
2911,2013-07-25 00:00:...,8,PROCESSING
5657,2013-07-25 00:00:...,9,PENDING_PAYMENT
5648,2013-07-25 00:00:...,10,PENDING_PAYMENT


In [9]:
spark.sql("SHOW DATABASES").filter("namespace like 'itv012857%'")

namespace
itv012857_groceries
itv012857_orders


In [None]:
spark.sql("CREATE DATABASE itv012857_orders")

In [10]:
spark.sql("USE itv012857_orders")

In [16]:
spark.sql("DROP TABLE orders")

In [17]:
spark.sql("SHOW TABLES")

database,tableName,isTemporary
,orders_view,True


In [18]:
spark.sql("CREATE TABLE orders(order_id string, \
            order_date string, \
            customer_id string, \
            order_status string) \
          ")

In [15]:
spark.sql("SHOW TABLES")

database,tableName,isTemporary
itv012857_orders,orders,False
,orders_view,True


In [19]:
spark.sql("describe extended orders").show(truncate = False)

+----------------------------+---------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                        |comment|
+----------------------------+---------------------------------------------------------------------------------+-------+
|order_id                    |string                                                                           |null   |
|order_date                  |string                                                                           |null   |
|customer_id                 |string                                                                           |null   |
|order_status                |string                                                                           |null   |
|                            |                                                                                 |       |
|# Detailed Table Information|  

In [20]:
spark.sql("SELECT * FROM orders")

order_id,order_date,customer_id,order_status


In [22]:
spark.sql("INSERT INTO orders SELECT * FROM orders_view")

In [23]:
spark.sql("SELECT * FROM orders")

order_id,order_date,customer_id,order_status
11599,2013-07-25 00:00:...,1,CLOSED
256,2013-07-25 00:00:...,2,PENDING_PAYMENT
12111,2013-07-25 00:00:...,3,COMPLETE
8827,2013-07-25 00:00:...,4,CLOSED
11318,2013-07-25 00:00:...,5,COMPLETE
7130,2013-07-25 00:00:...,6,COMPLETE
4530,2013-07-25 00:00:...,7,COMPLETE
2911,2013-07-25 00:00:...,8,PROCESSING
5657,2013-07-25 00:00:...,9,PENDING_PAYMENT
5648,2013-07-25 00:00:...,10,PENDING_PAYMENT


In [25]:
spark.sql("CREATE TABLE orders_external_table(order_id string, \
            order_date string, \
            customer_id string, \
            order_status string) \
            USING JSON \
            LOCATION '/public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json' \
          ")

In [27]:
spark.sql("describe extended orders_external_table").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|            order_id|              string|   null|
|          order_date|              string|   null|
|         customer_id|              string|   null|
|        order_status|              string|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|    itv012857_orders|       |
|               Table|orders_external_t...|       |
|               Owner|           itv012857|       |
|        Created Time|Wed Jul 10 02:07:...|       |
|         Last Access|             UNKNOWN|       |
|          Created By|         Spark 3.1.2|       |
|                Type|            EXTERNAL|       |
|            Provider|                JSON|       |
|            Location|hdfs://m01.itvers...|       |
|       Serde Library|org.apache.hadoop...|       |
|         In

In [28]:
spark.sql("select * from orders_external_table")

order_id,order_date,customer_id,order_status
1,2013-07-25 00:00:...,11599,CLOSED
2,2013-07-25 00:00:...,256,PENDING_PAYMENT
3,2013-07-25 00:00:...,12111,COMPLETE
4,2013-07-25 00:00:...,8827,CLOSED
5,2013-07-25 00:00:...,11318,COMPLETE
6,2013-07-25 00:00:...,7130,COMPLETE
7,2013-07-25 00:00:...,4530,COMPLETE
8,2013-07-25 00:00:...,2911,PROCESSING
9,2013-07-25 00:00:...,5657,PENDING_PAYMENT
10,2013-07-25 00:00:...,5648,PENDING_PAYMENT


In [None]:
spark.stop()