##### Create Delta Tables using Spark SQL

In [0]:
%sql
-- Supported table providers: CSV, JSON, ORC, PARQUET, AVRO, DELTA.
-- DELTA is a flavor on top of Parquet that facilitates INSERT, UPDATE and DELETE.
DROP TABLE IF EXISTS orders;

CREATE TABLE orders (
  order_id          INT,
  order_date        DATE,
  order_customer_id INT,
  order_status      STRING
)
USING DELTA -- other providers: PARQUET, CSV, JSON


In [0]:
%sql
-- CASCADE is needed when the database is not empty (it still contains tables).
DROP DATABASE IF EXISTS itversity_retail_db CASCADE

In [0]:
%sql
SET spark.sql.warehouse.dir; -- Default is dbfs:/user/hive/warehouse

key,value
spark.sql.warehouse.dir,*********(redacted)


In [0]:
%fs ls dbfs:/user/hive/warehouse

path,name,size,modificationTime
dbfs:/user/hive/warehouse/itversity_demo.db/,itversity_demo.db/,0,1696326285903
dbfs:/user/hive/warehouse/orders/,orders/,0,0
dbfs:/user/hive/warehouse/sales/,sales/,0,0


In [0]:
%sql
CREATE DATABASE IF NOT EXISTS itversity_retail_db;

In [0]:
%fs ls dbfs:/user/hive/warehouse

path,name,size,modificationTime
dbfs:/user/hive/warehouse/itversity_demo.db/,itversity_demo.db/,0,1696326285903
dbfs:/user/hive/warehouse/itversity_retail_db.db/,itversity_retail_db.db/,0,1696387967233
dbfs:/user/hive/warehouse/orders/,orders/,0,0
dbfs:/user/hive/warehouse/sales/,sales/,0,0


In [0]:
%sql
DESCRIBE DATABASE itversity_retail_db;

database_description_item,database_description_value
Catalog Name,spark_catalog
Namespace Name,itversity_retail_db
Comment,
Location,dbfs:/user/hive/warehouse/itversity_retail_db.db
Owner,root


In [0]:
%sql
-- Recreate the database at an explicit location instead of the default
-- warehouse directory.
DROP DATABASE IF EXISTS itversity_retail_db CASCADE;

CREATE DATABASE IF NOT EXISTS itversity_retail_db
LOCATION 'dbfs:/public/warehouse/itversity_retail_db.db';

In [0]:
%sql
DESCRIBE DATABASE itversity_retail_db;

database_description_item,database_description_value
Catalog Name,spark_catalog
Namespace Name,itversity_retail_db
Comment,
Location,dbfs:/public/warehouse/itversity_retail_db.db
Owner,root


In [0]:
%fs ls dbfs:/public/warehouse/itversity_retail_db.db

path,name,size,modificationTime
dbfs:/public/warehouse/itversity_retail_db.db/order_items/,order_items/,0,0


### Creating Managed Tables

In [0]:
%sql
-- Creating managed tables: recreate the database without a LOCATION clause
-- and make it the current database for the cells that follow.
DROP DATABASE IF EXISTS itversity_retail_db CASCADE;
CREATE DATABASE IF NOT EXISTS itversity_retail_db;
USE itversity_retail_db;
SELECT current_database();

current_database()
itversity_retail_db


In [0]:
%sql
-- Note: the USING clause selects the table format. USING DELTA creates a Delta
-- table; if you say USING PARQUET the table is created as a Parquet-formatted
-- table, and so on.
DROP TABLE IF EXISTS itversity_retail_db.orders;

CREATE TABLE itversity_retail_db.orders (
  order_id          BIGINT,
  order_date        STRING,
  order_customer_id BIGINT,
  order_status      STRING
)
USING DELTA;

In [0]:
%fs ls dbfs:/user/hive/warehouse/itversity_retail_db.db

path,name,size,modificationTime
dbfs:/user/hive/warehouse/itversity_retail_db.db/orders/,orders/,0,0


In [0]:
%sql
DESCRIBE itversity_retail_db.orders;

col_name,data_type,comment
order_id,bigint,
order_date,string,
order_customer_id,bigint,
order_status,string,


In [0]:
%sql
DESCRIBE FORMATTED itversity_retail_db.orders;

col_name,data_type,comment
order_id,bigint,
order_date,string,
order_customer_id,bigint,
order_status,string,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,itversity_retail_db,
Table,orders,
Created Time,Wed Oct 04 02:53:58 UTC 2023,


In [0]:
%sql
SHOW CREATE TABLE itversity_retail_db.orders;

createtab_stmt
"CREATE TABLE spark_catalog.itversity_retail_db.orders (  order_id BIGINT,  order_date STRING,  order_customer_id BIGINT,  order_status STRING) USING delta TBLPROPERTIES (  'delta.minReaderVersion' = '1',  'delta.minWriterVersion' = '2')"


In [0]:
%fs ls dbfs:/public/

path,name,size,modificationTime
dbfs:/public/retail_db/,retail_db/,0,1696232771082
dbfs:/public/retail_db_json/,retail_db_json/,0,1696292619209
dbfs:/public/retail_db_parquet/,retail_db_parquet/,0,0
dbfs:/public/warehouse/,warehouse/,0,1696291507629


In [0]:
%fs ls dbfs:/public/retail_db_json/orders

path,name,size,modificationTime
dbfs:/public/retail_db_json/orders/part-00000.txt,part-00000.txt,7477339,1696292631031


In [0]:
# %fs head dbfs:/public/retail_db_json/orders/part-00000.txt

In [0]:
%sql
SELECT * FROM JSON.`dbfs:/public/retail_db_json/orders/part-00000.txt` LIMIT 10;

order_customer_id,order_date,order_id,order_status
11599,2013-07-25 00:00:00.0,1,CLOSED
256,2013-07-25 00:00:00.0,2,PENDING_PAYMENT
12111,2013-07-25 00:00:00.0,3,COMPLETE
8827,2013-07-25 00:00:00.0,4,CLOSED
11318,2013-07-25 00:00:00.0,5,COMPLETE
7130,2013-07-25 00:00:00.0,6,COMPLETE
4530,2013-07-25 00:00:00.0,7,COMPLETE
2911,2013-07-25 00:00:00.0,8,PROCESSING
5657,2013-07-25 00:00:00.0,9,PENDING_PAYMENT
5648,2013-07-25 00:00:00.0,10,PENDING_PAYMENT


In [0]:
%sql
-- Start from an empty managed Delta table before loading the JSON data.
DROP TABLE IF EXISTS itversity_retail_db.orders;

CREATE TABLE itversity_retail_db.orders (
  order_id          BIGINT,
  order_date        STRING,
  order_customer_id BIGINT,
  order_status      STRING
)
USING DELTA;

In [0]:
%sql
-- Note: to insert data you can use either INSERT or the COPY INTO command;
-- here COPY INTO loads the JSON file into the table.
COPY INTO itversity_retail_db.orders
  FROM 'dbfs:/public/retail_db_json/orders/part-00000.txt'
  FILEFORMAT = JSON;

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
68883,68883,0


### Validating

In [0]:
%fs ls dbfs:/user/hive/warehouse/itversity_retail_db.db/orders/

path,name,size,modificationTime
dbfs:/user/hive/warehouse/itversity_retail_db.db/orders/_delta_log/,_delta_log/,0,1696388107916
dbfs:/user/hive/warehouse/itversity_retail_db.db/orders/part-00000-1034503d-e16e-4d77-a25d-3e892eea6604-c000.snappy.parquet,part-00000-1034503d-e16e-4d77-a25d-3e892eea6604-c000.snappy.parquet,294544,1696388124079
dbfs:/user/hive/warehouse/itversity_retail_db.db/orders/part-00001-65ad9b35-4878-4714-9f70-5b6548853eff-c000.snappy.parquet,part-00001-65ad9b35-4878-4714-9f70-5b6548853eff-c000.snappy.parquet,241619,1696388124071


In [0]:
%sql
-- Fully qualified so the count does not depend on the current database
-- (an unqualified `orders` table is also created earlier in this notebook).
SELECT count(*) FROM itversity_retail_db.orders;

count(1)
68883


#### Creating External Table

In [0]:
%sql
-- External table: the data files live at the explicit LOCATION below
-- (the directory is created if the path does not exist yet).
-- Names are fully qualified so the cell does not depend on an earlier USE.
DROP TABLE IF EXISTS itversity_retail_db.order_items;

CREATE EXTERNAL TABLE itversity_retail_db.order_items (
  order_item               BIGINT,
  order_item_order_id      BIGINT,
  order_item_product_id    BIGINT,
  order_item_quantity      BIGINT,
  order_item_subtotal      DOUBLE,
  order_item_product_price DOUBLE
)
USING DELTA
LOCATION 'dbfs:/public/warehouse/itversity_retail_db.db/order_items';

In [0]:
%fs ls dbfs:/public/warehouse/itversity_retail_db.db/order_items

path,name,size,modificationTime
dbfs:/public/warehouse/itversity_retail_db.db/order_items/_delta_log/,_delta_log/,0,1696388184703


In [0]:
%sql
DESCRIBE FORMATTED order_items;

col_name,data_type,comment
order_item,bigint,
order_item_order_id,bigint,
order_item_product_id,bigint,
order_item_quantity,bigint,
order_item_subtotal,double,
order_item_product_price,double,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,itversity_retail_db,


In [0]:
%sql
-- INSERT OVERWRITE keeps this load idempotent: order_items is an external
-- table, so its files survive DROP TABLE / DROP DATABASE ... CASCADE, and a
-- plain INSERT INTO would append a second copy of the rows on every re-run.
-- Columns are matched by position: order_item_id feeds the order_item column.
INSERT OVERWRITE itversity_retail_db.order_items
SELECT order_item_id
     , order_item_order_id
     , order_item_product_id
     , order_item_quantity
     , order_item_subtotal
     , order_item_product_price
FROM JSON.`dbfs:/public/retail_db_json/order_items`;

num_affected_rows,num_inserted_rows
172198,172198


In [0]:
%sql
show tables;

database,tableName,isTemporary
itversity_retail_db,order_items,False
itversity_retail_db,orders,False


### Validate

In [0]:
%sql
select * from itversity_retail_db.order_items limit 10

order_item,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99
6,4,365,5,299.95,59.99
7,4,502,3,150.0,50.0
8,4,1014,4,199.92,49.98
9,5,957,1,299.98,299.98
10,5,365,5,299.95,59.99


In [0]:
%sql
select count(*)  from itversity_retail_db.order_items

count(1)
172198


In [0]:
%sql
-- TASK: revenue for each order_id (first 10 orders by order id)
SELECT
  order_item_order_id,
  ROUND(SUM(order_item_subtotal), 2) AS order_revenue
FROM order_items
GROUP BY order_item_order_id
ORDER BY order_item_order_id
LIMIT 10;

order_item_order_id,order_revenue
1,299.98
2,579.98
4,699.85
5,1129.86
7,579.92
8,729.84
9,599.96
10,651.92
11,919.79
12,1299.87


In [0]:

# CRUD (CREATE, SELECT, INSERT, UPDATE, DELETE):
# DELTA supports CRUD + MERGE operations
# JSON, CSV, PARQUET, AVRO tables don't support full CRUD (no UPDATE, DELETE or MERGE)
# DELTA comes out of the box in Databricks; no additional libraries need to be installed
# We typically use DELTA on Databricks-based Spark clusters