### Spark SQL Basic Transformation


In [0]:
%sql
-- NOTE: the %sql magic is optional in this notebook because SQL was chosen as its language.
select current_date();

current_date()
2023-10-04


In [0]:
%fs ls dbfs:/public

path,name,size,modificationTime
dbfs:/public/retail_db/,retail_db/,0,1696232771082
dbfs:/public/retail_db_json/,retail_db_json/,0,1696292619209
dbfs:/public/retail_db_parquet/,retail_db_parquet/,0,0
dbfs:/public/warehouse/,warehouse/,0,1696291507629


In [0]:
%fs ls dbfs:/public/retail_db

path,name,size,modificationTime
dbfs:/public/retail_db/categories/,categories/,0,1696232771583
dbfs:/public/retail_db/customers/,customers/,0,1696232772257
dbfs:/public/retail_db/daily_product_revenue/,daily_product_revenue/,0,1696280507385
dbfs:/public/retail_db/departments/,departments/,0,1696232773479
dbfs:/public/retail_db/order_items/,order_items/,0,1696232775755
dbfs:/public/retail_db/orders/,orders/,0,1696232774387
dbfs:/public/retail_db/products/,products/,0,1696232777898
dbfs:/public/retail_db/schemas.json,schemas.json,4254,1696232778715


In [0]:
%sql
-- Inspect the raw files in place, without defining a table first.
-- The same path-prefix form works for data stored as TEXT, JSON, CSV or PARQUET.
select *
from text.`dbfs:/public/retail_db/orders/`
limit 10;

value
"1,2013-07-25 00:00:00.0,11599,CLOSED"
"2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT"
"3,2013-07-25 00:00:00.0,12111,COMPLETE"
"4,2013-07-25 00:00:00.0,8827,CLOSED"
"5,2013-07-25 00:00:00.0,11318,COMPLETE"
"6,2013-07-25 00:00:00.0,7130,COMPLETE"
"7,2013-07-25 00:00:00.0,4530,COMPLETE"
"8,2013-07-25 00:00:00.0,2911,PROCESSING"
"9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT"
"10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT"


### Creating temporary views for orders and order_items

In [0]:
%sql
-- Temporary view over the comma-delimited orders files.
-- `or replace` keeps this cell re-runnable: a plain `create temporary view`
-- fails when the view already exists in the session.
-- The sampled raw rows start directly with data (no header line), so the
-- column names and types are declared explicitly here.
-- NOTE(review): order_date is kept as a string; the raw values look like
-- '2013-07-25 00:00:00.0' -- confirm whether a timestamp type is wanted downstream.
create or replace temporary view orders (
  order_id int,
  order_date string,
  order_customer_id int,
  order_status string
)
using csv
options (
  path = 'dbfs:/public/retail_db/orders/',
  sep = ','
);

In [0]:
%sql
-- Confirm the column names and types registered for the orders view.
describe table orders;

col_name,data_type,comment
order_id,int,
order_date,string,
order_customer_id,int,
order_status,string,


In [0]:
%sql
-- Preview a few parsed rows to check that the declared schema lines up with the data.
select *
from orders
limit 10;

order_id,order_date,order_customer_id,order_status
1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT


In [0]:
%sql
-- Inspect the raw order_items lines before declaring a schema for them.
select *
from text.`dbfs:/public/retail_db/order_items/`
limit 10;

value
"1,1,957,1,299.98,299.98"
"2,2,1073,1,199.99,199.99"
"3,2,502,5,250.0,50.0"
"4,2,403,1,129.99,129.99"
"5,4,897,2,49.98,24.99"
"6,4,365,5,299.95,59.99"
"7,4,502,3,150.0,50.0"
"8,4,1014,4,199.92,49.98"
"9,5,957,1,299.98,299.98"
"10,5,365,5,299.95,59.99"


In [0]:
%sql
-- Temporary view over the comma-delimited order_items files.
-- Column names and types are declared explicitly; `or replace` lets the cell be re-run.
-- NOTE(review): subtotal and product price look like currency amounts stored as
-- float -- consider decimal(p, s) if exact arithmetic matters; confirm before changing.
create or replace temporary view order_items (
  order_item_id int,
  order_item_order_id int,
  order_item_product_id int,
  order_item_quantity int,
  order_item_subtotal float,
  order_item_product_price float
)
using csv
options (
  path = 'dbfs:/public/retail_db/order_items/',
  sep = ','
);


In [0]:
%sql
-- Preview a few parsed rows from the order_items view.
select *
from order_items
limit 10;

order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99
6,4,365,5,299.95,59.99
7,4,502,3,150.0,50.0
8,4,1014,4,199.92,49.98
9,5,957,1,299.98,299.98
10,5,365,5,299.95,59.99


In [0]:
%sql
-- Re-check the orders schema before joining it to order_items.
describe table orders;

col_name,data_type,comment
order_id,int,
order_date,string,
order_customer_id,int,
order_status,string,


In [0]:
%sql
-- Check the order_items schema before joining it to orders.
describe table order_items;

col_name,data_type,comment
order_item_id,int,
order_item_order_id,int,
order_item_product_id,int,
order_item_quantity,int,
order_item_subtotal,float,
order_item_product_price,float,


In [0]:
%sql
-- Daily revenue per product, restricted to orders whose status is
-- COMPLETE or CLOSED. Returns the first 10 rows ordered by date, then by
-- revenue from highest to lowest.
-- Grouping and ordering use column names instead of ordinal positions so the
-- query does not silently change meaning if the select list is reordered.
select
  o.order_date,
  oi.order_item_product_id as product_id,
  round(sum(oi.order_item_subtotal), 2) as revenue
from orders as o
inner join order_items as oi
  on o.order_id = oi.order_item_order_id
where o.order_status in ('COMPLETE', 'CLOSED')
group by
  o.order_date,
  oi.order_item_product_id
-- product_id breaks revenue ties so the limited result is deterministic.
order by
  order_date,
  revenue desc,
  product_id
limit 10;

order_date,product_id,revenue
2013-07-25 00:00:00.0,1004,5599.72
2013-07-25 00:00:00.0,191,5099.49
2013-07-25 00:00:00.0,957,4499.7
2013-07-25 00:00:00.0,365,3359.44
2013-07-25 00:00:00.0,1073,2999.85
2013-07-25 00:00:00.0,1014,2798.88
2013-07-25 00:00:00.0,403,1949.85
2013-07-25 00:00:00.0,502,1650.0
2013-07-25 00:00:00.0,627,1079.73
2013-07-25 00:00:00.0,226,599.99


### Write/save the result to a specified DBFS location


In [0]:
%sql
-- Persist daily revenue per product (orders with status COMPLETE or CLOSED)
-- as Parquet files, overwriting the contents of the target directory.
-- Grouping uses column names instead of ordinal positions so the query does
-- not silently change meaning if the select list is reordered.
insert overwrite directory 'dbfs:/public/retail_db/daily_product_revenue'
using parquet
select
  o.order_date,
  oi.order_item_product_id as product_id,
  round(sum(oi.order_item_subtotal), 2) as revenue
from orders as o
inner join order_items as oi
  on o.order_id = oi.order_item_order_id
where o.order_status in ('COMPLETE', 'CLOSED')
group by
  o.order_date,
  oi.order_item_product_id;

In [0]:
%fs ls dbfs:/public/retail_db/daily_product_revenue/

path,name,size,modificationTime
dbfs:/public/retail_db/daily_product_revenue/_committed_3020937405764686670,_committed_3020937405764686670,222,1696391064554
dbfs:/public/retail_db/daily_product_revenue/_committed_6058895720273631727,_committed_6058895720273631727,222,1696388655274
dbfs:/public/retail_db/daily_product_revenue/_committed_7260991865645144740,_committed_7260991865645144740,231,1696280507139
dbfs:/public/retail_db/daily_product_revenue/_started_3020937405764686670,_started_3020937405764686670,0,1696391062382
dbfs:/public/retail_db/daily_product_revenue/part-00000-tid-3020937405764686670-8797117a-fab2-4a6d-b84e-2d819a85239f-33-1-c000.snappy.parquet,part-00000-tid-3020937405764686670-8797117a-fab2-4a6d-b84e-2d819a85239f-33-1-c000.snappy.parquet,38164,1696391064164


In [0]:
%sql
-- Directly reviewing the structure of data, if data is in TEXT/PARQUET format
-- Read the Parquet output straight from its directory, sorted by date and
-- then by revenue from highest to lowest.
select *
from parquet.`dbfs:/public/retail_db/daily_product_revenue/`
order by
  order_date,
  revenue desc
limit 10;

order_date,product_id,revenue
2013-07-25 00:00:00.0,1004,5599.72
2013-07-25 00:00:00.0,191,5099.49
2013-07-25 00:00:00.0,957,4499.7
2013-07-25 00:00:00.0,365,3359.44
2013-07-25 00:00:00.0,1073,2999.85
2013-07-25 00:00:00.0,1014,2798.88
2013-07-25 00:00:00.0,403,1949.85
2013-07-25 00:00:00.0,502,1650.0
2013-07-25 00:00:00.0,627,1079.73
2013-07-25 00:00:00.0,226,599.99
