##### Getting Started

In [0]:
%fs ls dbfs:/

path,name,size,modificationTime
dbfs:/Volume/,Volume/,0,0
dbfs:/Volumes/,Volumes/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/public/,public/,0,0
dbfs:/volume/,volume/,0,0
dbfs:/volumes/,volumes/,0,0


In [0]:
print('hello world')

hello world


In [0]:
# spark.sql() runs a Spark SQL query from Python code; .show() prints the result as a text table.
spark.sql('SELECT current_date').show()

+--------------+
|current_date()|
+--------------+
|    2023-10-02|
+--------------+



In [0]:
%sql -- %sql is a magic command that runs Spark SQL queries directly, without having to wrap them in Python code
SELECT current_date();

current_date()
2023-10-02


In [0]:
%fs ls dbfs:/public/retail_db/

path,name,size,modificationTime
dbfs:/public/retail_db/categories/,categories/,0,1696232771583
dbfs:/public/retail_db/customers/,customers/,0,1696232772257
dbfs:/public/retail_db/daily_product_revenue/,daily_product_revenue/,0,1696280507385
dbfs:/public/retail_db/departments/,departments/,0,1696232773479
dbfs:/public/retail_db/order_items/,order_items/,0,1696232775755
dbfs:/public/retail_db/orders/,orders/,0,1696232774387
dbfs:/public/retail_db/products/,products/,0,1696232777898
dbfs:/public/retail_db/schemas.json,schemas.json,4254,1696232778715


In [0]:
%fs ls dbfs:/public/retail_db/orders/

path,name,size,modificationTime
dbfs:/public/retail_db/orders/part-00000.txt,part-00000.txt,2999944,1696232775527


In [0]:
%sql
-- Create a session-scoped view directly against the CSV file in DBFS (no DataFrame needed).
-- OR REPLACE keeps this cell re-runnable: a plain CREATE TEMPORARY VIEW fails when the
-- view already exists in the current session.
-- order_date is declared as STRING here, so queries return the raw text stored in the file.
CREATE OR REPLACE TEMPORARY VIEW orders (
  order_id INT,
  order_date STRING,
  order_customer_id INT,
  order_status STRING
) USING csv
OPTIONS (path='dbfs:/public/retail_db/orders/part-00000.txt')

In [0]:
%sql
select * from orders limit 10;

order_id,order_date,order_customer_id,order_status
1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT


In [0]:
spark.sql("select * from orders").show(10)

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
+--------+--------------------+-----------------+---------------+
only showing top 10 rows



#### How to access a DataFrame from SQL
- `temporary view`: session-scoped, i.e. only valid within the Spark session that created it
- `global temporary view`: application-scoped, i.e. available within the entire Spark application (queried through the `global_temp` database)

In [0]:
%fs ls dbfs:/public/retail_db/orders/ 

path,name,size,modificationTime
dbfs:/public/retail_db/orders/part-00000.txt,part-00000.txt,2999944,1696232775527


In [0]:
# Load the orders files into a DataFrame using an explicit DDL schema string,
# then preview the first 5 rows.
orders_schema = 'order_id INT, order_date DATE, order_customer_id INT, order_status STRING'
orders_df = (
    spark.read
    .schema(orders_schema)
    .csv('dbfs:/public/retail_db/orders/')
)
orders_df.show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



In [0]:
# 1. Temporary view: register orders_df under the name "orders_v" so that it can be
#    queried from SQL (%sql cells or spark.sql()).

orders_df.createOrReplaceTempView("orders_v")

In [0]:
%sql
SELECT * FROM orders_v limit 5;   -- running from SQL shell

order_id,order_date,order_customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE


In [0]:
 # Same view queried through the spark.sql() function from Python; spark.sql() returns a DataFrame

spark.sql("SELECT * FROM orders_v WHERE order_status = 'CLOSED'").show(5)  


+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|       1|2013-07-25|            11599|      CLOSED|
|       4|2013-07-25|             8827|      CLOSED|
|      12|2013-07-25|             1837|      CLOSED|
|      18|2013-07-25|             1205|      CLOSED|
|      24|2013-07-25|            11441|      CLOSED|
+--------+----------+-----------------+------------+
only showing top 5 rows



In [0]:
# When to use which:
# - spark.sql() gives you the result as a DataFrame, so it can be processed further in Python.
# - Another benefit: Python variables can be interpolated into the query text (f-string).
#   NOTE(review): the original note says numeric values work but string values do not (to be
#   checked later). Presumably a string value must be wrapped in single quotes inside the SQL
#   text, as in order_status = 'CLOSED' -- confirm.

cust_id = 11599
spark.sql(f'SELECT * FROM orders_v WHERE order_customer_id ={cust_id}').show(5)


+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|       1|2013-07-25|            11599|      CLOSED|
|   11397|2013-10-03|            11599|    COMPLETE|
|   23908|2013-12-20|            11599|    COMPLETE|
|   53545|2014-06-27|            11599|     PENDING|
|   59911|2013-10-17|            11599|  PROCESSING|
+--------+----------+-----------------+------------+



In [0]:
# Global View


In [0]:
# 2. Global temporary view: register orders_df under the name "orders_gv".
#    It must be queried with the global_temp prefix, i.e. global_temp.orders_gv.
orders_df.createOrReplaceGlobalTempView("orders_gv")

In [0]:
%sql
-- SELECT * FROM orders_gv LIMIT 5;   -- ERROR because Spark registers that view against a database called global_temp

In [0]:
%sql
SHOW TABLES IN global_temp;

database,tableName,isTemporary
global_temp,orders_gv,True
,orders_v,True


In [0]:
%sql
SELECT * FROM global_temp.orders_gv LIMIT 5; 

order_id,order_date,order_customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE


In [0]:
spark.sql("SELECT * FROM global_temp.orders_gv").show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



#### Views
- `temp view`, `global temp view`, `permanent view`
- Creating views on top of the tables themselves rather than on top of a DataFrame
- Permanent view: registered in the Hive metastore, so it can be accessed even after you restart the cluster, or from another cluster

In [0]:
%sql
SHOW DATABASES;

databaseName
default
itversity_demo
itversity_retail_db


In [0]:
%sql
USE itversity_retail_db;
SHOW TABLES;

database,tableName,isTemporary
itversity_retail_db,daily_revenue,False
itversity_retail_db,daily_revenue_stg,False
itversity_retail_db,dept,False
itversity_retail_db,emp,False
itversity_retail_db,order_items,False
itversity_retail_db,orders,False
itversity_retail_db,users,False
,orders_v,True


In [0]:
%sql
-- Temporary view defined on top of the emp table (instead of on top of a DataFrame)
CREATE OR REPLACE TEMPORARY VIEW emp_v AS
SELECT * FROM emp;

In [0]:
%sql
-- Global temporary view on top of the emp table; query it as global_temp.emp_gv
CREATE OR REPLACE GLOBAL TEMPORARY VIEW emp_gv AS
SELECT * FROM emp;

In [0]:
%sql
-- Permanent view on top of the emp table; it is created in the current database
CREATE OR REPLACE VIEW emp_pv AS
SELECT * FROM emp;

In [0]:
%sql
SELECT * FROM emp_v limit 3;

emp_id,name,superior_emp_id,year_joined,emp_dept_id,gender,salary
1,Smith,-1,2018,10,M,3000.0
2,Rose,1,2010,20,M,4000.0
3,Williams,1,2010,10,M,1000.0


In [0]:
%sql
SELECT * FROM global_temp.emp_gv limit 3;

emp_id,name,superior_emp_id,year_joined,emp_dept_id,gender,salary
1,Smith,-1,2018,10,M,3000.0
2,Rose,1,2010,20,M,4000.0
3,Williams,1,2010,10,M,1000.0


In [0]:
%sql
SELECT * FROM emp_pv limit 3;

emp_id,name,superior_emp_id,year_joined,emp_dept_id,gender,salary
1,Smith,-1,2018,10,M,3000.0
2,Rose,1,2010,20,M,4000.0
3,Williams,1,2010,10,M,1000.0


In [0]:
# Preview two rows from each kind of view (temporary, global temporary, permanent)
# through spark.sql(); the queries run in the same order as before.
view_queries = (
    "SELECT * FROM emp_v",
    "SELECT * FROM global_temp.emp_gv",
    "SELECT * FROM emp_pv",
)
for query in view_queries:
    spark.sql(query).show(2)

+------+-----+---------------+-----------+-----------+------+------+
|emp_id| name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|     1|Smith|             -1|       2018|         10|     M|3000.0|
|     2| Rose|              1|       2010|         20|     M|4000.0|
+------+-----+---------------+-----------+-----------+------+------+
only showing top 2 rows

+------+-----+---------------+-----------+-----------+------+------+
|emp_id| name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|     1|Smith|             -1|       2018|         10|     M|3000.0|
|     2| Rose|              1|       2010|         20|     M|4000.0|
+------+-----+---------------+-----------+-----------+------+------+
only showing top 2 rows

+------+-----+---------------+-----------+-----------+------+------+
|emp_id| name|superior_emp_id|year_joined|emp_dept_id

In [0]:
%sql
SHOW TABLES;

database,tableName,isTemporary
itversity_retail_db,daily_revenue,False
itversity_retail_db,daily_revenue_stg,False
itversity_retail_db,dept,False
itversity_retail_db,emp,False
itversity_retail_db,emp_pv,False
itversity_retail_db,order_items,False
itversity_retail_db,orders,False
itversity_retail_db,users,False
,emp_v,True
,orders_v,True
