In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook-wide Spark session, named "SparkSQL",
# with the Hive catalog implementation and Hive support switched on.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .getOrCreate()
)

**Database & Table Tasks**

1. Create a new database named sales_db.

In [2]:
# Create the working database. `if not exists` keeps this cell re-runnable:
# a plain `create database` raises once the database already exists.
# NOTE: the database is named `salesdb` (no underscore); the `use salesdb`
# cell that follows depends on exactly this name.
spark.sql("create database if not exists salesdb")

DataFrame[]

2. Set the current database to sales_db.

In [3]:
# Make salesdb the session's current database for the statements that follow.
use_database_sql = "use salesdb"
spark.sql(use_database_sql)

DataFrame[]

3. Create table product_sales

In [4]:
# Define the product_sales table: product id/name/category plus price,
# quantity and sale date.
create_product_sales_sql = """
create table product_sales (
ProductID int,
ProductName varchar(30),
Category string,
Price double,
Quantity int,
SaleDate date)"""
spark.sql(create_product_sales_sql)

DataFrame[]

4. Insert at least 5 rows into product_sales

In [6]:
# Seed product_sales with five sample rows; dates are cast explicitly from
# string literals to the DATE column type.
insert_product_sales_sql = """
insert into product_sales values
    (1, 'Smartwatch', 'electronics', 999.99, 2, cast('2025-01-15' as date)),
    (2, 'Mobile', 'electronics', 799.99, 3, cast('2023-05-12' as date)),
    (3, 'Spoon', 'Utensil', 249.99, 5, cast('2025-11-12' as date)),
    (4, 'coffee maker', 'appliances', 129.99, 4, cast('2025-01-24' as date)),
    (5, 'Car', 'Vehicle', 200, 6, cast('2024-01-19' as date))"""
spark.sql(insert_product_sales_sql)

DataFrame[]

 **Query Tasks**
 5. Select all records from product_sales.

In [8]:
# Display every row currently stored in product_sales.
all_products = spark.sql("select * from product_sales")
all_products.show()

+---------+------------+-----------+------+--------+----------+
|ProductID| ProductName|   Category| Price|Quantity|  SaleDate|
+---------+------------+-----------+------+--------+----------+
|        1|  Smartwatch|electronics|999.99|       2|2025-01-15|
|        2|      Mobile|electronics|799.99|       3|2023-05-12|
|        3|       Spoon|    Utensil|249.99|       5|2025-11-12|
|        4|coffee maker| appliances|129.99|       4|2025-01-24|
|        5|         Car|    Vehicle| 200.0|       6|2024-01-19|
+---------+------------+-----------+------+--------+----------+



6. Retrieve products where price is above 500

In [9]:
# Keep only the products priced strictly above 500.
expensive_products = spark.sql("select * from product_sales where price > 500")
expensive_products.show()

+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|        1| Smartwatch|electronics|999.99|       2|2025-01-15|
|        2|     Mobile|electronics|799.99|       3|2023-05-12|
+---------+-----------+-----------+------+--------+----------+



7. Calculate total sale amount (Price * Quantity) for each product

In [10]:
# Per-product sale amount. Price is declared as a double, so the product
# can carry binary floating-point noise (e.g. 2399.9700000000003).
total_amount_sql = """
Select ProductID, ProductName, Price, Quantity, (Price * Quantity) AS TotalAmount
from product_sales"""
spark.sql(total_amount_sql).show()

+---------+------------+------+--------+------------------+
|ProductID| ProductName| Price|Quantity|       TotalAmount|
+---------+------------+------+--------+------------------+
|        1|  Smartwatch|999.99|       2|           1999.98|
|        2|      Mobile|799.99|       3|2399.9700000000003|
|        3|       Spoon|249.99|       5|           1249.95|
|        4|coffee maker|129.99|       4|            519.96|
|        5|         Car| 200.0|       6|            1200.0|
+---------+------------+------+--------+------------------+



8. Find the number of products sold in each Category

In [14]:
# Count how many product rows fall into each Category.
category_count_sql = """select Category, count(*) as ProductCount
from product_sales
GROUP BY Category"""
spark.sql(category_count_sql).show()

+-----------+------------+
|   Category|ProductCount|
+-----------+------------+
|electronics|           2|
|    Utensil|           1|
| appliances|           1|
|    Vehicle|           1|
+-----------+------------+



9. Sort products by total sales in descending order

In [16]:
# Rank products by total sales (price * quantity), highest first.
sorted_sales_sql = """
select productid, productname, (price * quantity) as totalsales
from product_sales
order by totalsales desc"""
spark.sql(sorted_sales_sql).show()

+---------+------------+------------------+
|productid| productname|        totalsales|
+---------+------------+------------------+
|        2|      Mobile|2399.9700000000003|
|        1|  Smartwatch|           1999.98|
|        3|       Spoon|           1249.95|
|        5|         Car|            1200.0|
|        4|coffee maker|            519.96|
+---------+------------+------------------+



**Temporary View Tasks**

10. Create a PySpark DataFrame with dummy product data

In [20]:
from pyspark.sql import Row

# Row factory: the field names are fixed once, then each order is given positionally.
Order = Row("OrderID", "ProductID", "Quantity")
data = [
    Order(101, 1, 2),
    Order(102, 2, 1),
    Order(103, 3, 3),
    Order(104, 4, 2),
    Order(105, 5, 1),
]
# `df` is reused by the global-view cell further down, so the name is kept.
df = spark.createDataFrame(data)

# 11) Register it as a temporary view called temp_orders
df.createOrReplaceTempView("temp_orders")

# 12) Run a SQL query to filter temp_orders where quantity > 1
multi_item_orders = spark.sql("SELECT * from temp_orders where Quantity > 1")
multi_item_orders.show()

+-------+---------+--------+
|OrderID|ProductID|Quantity|
+-------+---------+--------+
|    101|        1|       2|
|    103|        3|       3|
|    104|        4|       2|
+-------+---------+--------+



**Global View Tasks**

In [22]:
# 13. Create a global temp view from a PySpark DataFrame named global_orders
df.createOrReplaceGlobalTempView("global_orders")

# 14. Run a SQL query on the global view from another notebook cell/session
# Global temp views are addressed through the `global_temp` database and are
# shared across sessions of the same Spark application. Querying from a fresh
# session (instead of the one that registered the view) demonstrates that
# cross-session visibility, which a plain temp view would not have.
other_session = spark.newSession()
other_session.sql("select * from global_temp.global_orders").show()

+-------+---------+--------+
|OrderID|ProductID|Quantity|
+-------+---------+--------+
|    101|        1|       2|
|    102|        2|       1|
|    103|        3|       3|
|    104|        4|       2|
|    105|        5|       1|
+-------+---------+--------+



**Join Tasks**

In [29]:
# 15. Create a second table customer_details with: CustomerID, Name, Gender, City, SignupDate
create_customer_details_sql = """
create table customer_details (
customerid int,
name string,
gender string,
city string,
signupdate date)"""
spark.sql(create_customer_details_sql)
# 16. Insert at least 3 records into customer_details
insert_customer_details_sql = """
insert into customer_details values
(1, 'Fathima Zahira', 'Female', 'new york', cast('2024-01-01' as date)),
(2, 'Rizza', 'female', 'los angeles', cast('2025-11-12' as date)),
(3, 'bobby knuckles', 'male', 'chicago', cast('2023-11-05' as date))"""
spark.sql(insert_customer_details_sql)

# 17. Write a SQL join between product_sales and customer_details
# NOTE(review): product_sales has no customer column, so the join matches
# productid against customerid as a stand-in key — confirm this is the
# intended relationship.
product_customer_join_sql = """
select p.productid,p.productname,c.name as customername,c.city
from product_sales p
join customer_details c on p.productid = c.customerid"""
spark.sql(product_customer_join_sql).show()

# 18. List customers who bought more than 2 products
# Sums product_sales.quantity per customer and keeps totals above 2.
# NOTE(review): customers are matched to products via customerid = productid,
# since no purchase link exists between the two tables — confirm intent.
customers_over_two_sql = """
select c.customerid,c.name, sum(p.quantity) as totalproducts
from customer_details c
join product_sales p on c.customerid = p.productid
group by c.customerid, c.name
having sum(p.quantity) > 2 """
spark.sql(customers_over_two_sql).show()

++
||
++
++

+---------+-----------+--------------+-----------+
|productid|productname|  customername|       city|
+---------+-----------+--------------+-----------+
|        1| Smartwatch|Fathima Zahira|   new york|
|        2|     Mobile|         Rizza|los angeles|
|        3|      Spoon|bobby knuckles|    chicago|
+---------+-----------+--------------+-----------+

+----------+--------------+-------------+
|customerid|          name|totalproducts|
+----------+--------------+-------------+
|         2|         Rizza|            3|
|         3|bobby knuckles|            5|
+----------+--------------+-------------+



 **View & Summary Tasks**

In [31]:
# 19. Create a SQL view sales_summary that includes: ProductName, Price, Quantity, Total = Price * Quantity
# `create or replace` lets this statement be re-run without dropping the view first.
sales_summary_view_sql = """
create or replace view sales_summary as
select productname, price, quantity, (price * quantity) as total
from product_sales"""
spark.sql(sales_summary_view_sql)

# 20. Query the view for records with Total > 1000.
high_value_sales = spark.sql("select * from sales_summary where total > 1000")
high_value_sales.show()

+-----------+------+--------+------------------+
|productname| price|quantity|             total|
+-----------+------+--------+------------------+
| Smartwatch|999.99|       2|           1999.98|
|     Mobile|799.99|       3|2399.9700000000003|
|      Spoon|249.99|       5|           1249.95|
|        Car| 200.0|       6|            1200.0|
+-----------+------+--------+------------------+



**Clean Up Tasks**

In [32]:
# 21. Drop the view sales_summary.
spark.sql("drop view if exists sales_summary")
# 22. Drop the tables product_sales and customer_details.
for table_name in ("product_sales", "customer_details"):
    spark.sql(f"drop table if exists {table_name}")
# 23. Drop the database created for this exercise.
# The database was created as `salesdb` (no underscore). Dropping `sales_db`
# with `if exists` silently does nothing and leaves `salesdb` behind, which
# then makes the `create database salesdb` cell fail on the next run.
# Switch to `default` first so the session is not left pointing at a
# database that no longer exists.
spark.sql("use default")
spark.sql("drop database if exists salesdb")

DataFrame[]