In [4]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below; the application is
# registered under the name "SQL Exercises".
spark = (
    SparkSession.builder
    .appName("SQL Exercises")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session object.
spark

In [5]:
# 1. Create the sales_db database; IF NOT EXISTS turns a re-run into a no-op.
spark.sql(
    "CREATE DATABASE IF NOT EXISTS sales_db"
)

DataFrame[]

In [6]:
# 2. Make sales_db the current database so the unqualified table names
#    used in the following cells resolve inside it.
spark.sql(
    "USE sales_db"
)

DataFrame[]

In [7]:
# 3. Define the product_sales table (Parquet-backed) in the current database.
#    One row per product sale: id, name, category, unit price, quantity, date.
product_sales_ddl = """
CREATE TABLE IF NOT EXISTS product_sales (
    ProductID INT,
    ProductName STRING,
    Category STRING,
    Price DOUBLE,
    Quantity INT,
    SaleDate DATE
)
USING PARQUET
"""

# IF NOT EXISTS keeps this cell safe to re-run.
spark.sql(product_sales_ddl)

DataFrame[]

In [8]:
# 4. Insert 5 rows into product_sales.
#    All rows are written by ONE multi-row INSERT instead of five separate
#    statements: a single write job, and the rows land together or not at all.
#    NOTE: INSERT INTO appends, so re-running this cell adds the rows again.
spark.sql("""
INSERT INTO product_sales VALUES
    (1, 'Laptop', 'Electronics', 55000, 2, DATE('2024-01-12')),
    (2, 'Mouse', 'Electronics', 800, 5, DATE('2024-01-13')),
    (3, 'Shirt', 'Fashion', 1200, 3, DATE('2024-02-01')),
    (4, 'Book', 'Books', 500, 4, DATE('2024-02-15')),
    (5, 'Chair', 'Furniture', 2000, 1, DATE('2024-03-10'))
""")

DataFrame[]

In [9]:
# 5. Display every row of product_sales.
#    No ORDER BY is given, so the row order of the output is not guaranteed.
(
    spark
    .sql("SELECT * FROM product_sales")
    .show()
)

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|        1|     Laptop|Electronics|55000.0|       2|2024-01-12|
|        2|      Mouse|Electronics|  800.0|       5|2024-01-13|
|        5|      Chair|  Furniture| 2000.0|       1|2024-03-10|
|        3|      Shirt|    Fashion| 1200.0|       3|2024-02-01|
|        4|       Book|      Books|  500.0|       4|2024-02-15|
+---------+-----------+-----------+-------+--------+----------+



In [10]:
# 6. Products whose unit price is strictly greater than 500
#    (a product priced at exactly 500 is excluded).
(
    spark
    .sql("SELECT * FROM product_sales WHERE Price > 500")
    .show()
)

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|        1|     Laptop|Electronics|55000.0|       2|2024-01-12|
|        2|      Mouse|Electronics|  800.0|       5|2024-01-13|
|        5|      Chair|  Furniture| 2000.0|       1|2024-03-10|
|        3|      Shirt|    Fashion| 1200.0|       3|2024-02-01|
+---------+-----------+-----------+-------+--------+----------+



In [11]:
# 7. Sale amount per row, computed as Price * Quantity and exposed as TotalSale.
(
    spark
    .sql("SELECT ProductName, Price * Quantity AS TotalSale FROM product_sales")
    .show()
)

+-----------+---------+
|ProductName|TotalSale|
+-----------+---------+
|     Laptop| 110000.0|
|      Mouse|   4000.0|
|      Chair|   2000.0|
|      Shirt|   3600.0|
|       Book|   2000.0|
+-----------+---------+



In [12]:
# 8. Units sold per category: sums Quantity within each Category
#    (this is a total of quantities, not a count of distinct products).
(
    spark
    .sql("SELECT Category, SUM(Quantity) AS TotalSold FROM product_sales GROUP BY Category")
    .show()
)

+-----------+---------+
|   Category|TotalSold|
+-----------+---------+
|Electronics|        7|
|  Furniture|        1|
|    Fashion|        3|
|      Books|        4|
+-----------+---------+



In [13]:
# 9. Per-row sale amount, largest first.
#    Rows with equal TotalSale have no tie-breaker, so their relative
#    order is not guaranteed.
(
    spark
    .sql("SELECT ProductName, Price * Quantity AS TotalSale FROM product_sales ORDER BY TotalSale DESC")
    .show()
)

+-----------+---------+
|ProductName|TotalSale|
+-----------+---------+
|     Laptop| 110000.0|
|      Mouse|   4000.0|
|      Shirt|   3600.0|
|      Chair|   2000.0|
|       Book|   2000.0|
+-----------+---------+



In [14]:
#10. Create dummy DataFrame
from pyspark.sql import Row

# Two sample order rows used by the temp-view exercises below.
temp_data = [
    Row(ProductID=101, ProductName="Tablet", Quantity=1),
    Row(ProductID=102, ProductName="Headphones", Quantity=3)
]

# Pass an explicit schema instead of relying on inference: inferred Python
# ints become BIGINT, whereas the tables in this notebook declare these
# columns as INT. The DDL string keeps the column types consistent.
df_temp = spark.createDataFrame(
    temp_data,
    schema="ProductID INT, ProductName STRING, Quantity INT",
)

In [15]:
# 11. Expose df_temp to SQL queries under the temporary view name temp_orders
#     (replaces any existing temp view with that name).
df_temp.createOrReplaceTempView(
    "temp_orders"
)

In [16]:
# 12. Query the temp view: keep only orders with Quantity strictly above 1.
(
    spark
    .sql("SELECT * FROM temp_orders WHERE Quantity > 1")
    .show()
)

+---------+-----------+--------+
|ProductID|ProductName|Quantity|
+---------+-----------+--------+
|      102| Headphones|       3|
+---------+-----------+--------+



In [17]:
# 13. Publish df_temp as the global temporary view global_orders
#     (replaces any existing global temp view with that name).
df_temp.createOrReplaceGlobalTempView(
    "global_orders"
)

In [18]:
# 14. Query the global temp view from ANOTHER session.
#     spark.newSession() returns a separate session of the same Spark
#     application, so this actually demonstrates what the exercise asks for:
#     a global temp view (always addressed through the global_temp database)
#     is visible outside the session that created it.
(
    spark
    .newSession()
    .sql("SELECT * FROM global_temp.global_orders")
    .show()
)

+---------+-----------+--------+
|ProductID|ProductName|Quantity|
+---------+-----------+--------+
|      101|     Tablet|       1|
|      102| Headphones|       3|
+---------+-----------+--------+



In [19]:
# 15. Define the customer_details table (Parquet-backed) in the current database.
#     One row per customer: id, name, gender, city, signup date.
customer_details_ddl = """
CREATE TABLE IF NOT EXISTS customer_details (
    CustomerID INT,
    Name STRING,
    Gender STRING,
    City STRING,
    SignupDate DATE
)
USING PARQUET
"""

# IF NOT EXISTS keeps this cell safe to re-run.
spark.sql(customer_details_ddl)

DataFrame[]

In [20]:
# 16. Insert 3 records into customer_details.
#     One multi-row INSERT replaces three single-row statements: a single
#     write job, and the rows land together or not at all.
#     NOTE: INSERT INTO appends, so re-running this cell adds the rows again.
spark.sql("""
INSERT INTO customer_details VALUES
    (1, 'Ali', 'Male', 'Hyderabad', DATE('2022-05-10')),
    (2, 'Neha', 'Female', 'Mumbai', DATE('2023-01-15')),
    (3, 'Ravi', 'Male', 'Delhi', DATE('2021-12-01'))
""")

DataFrame[]

In [21]:
# 17. Inner join of products and customers.
#     There is no real foreign key between the two tables; the exercise
#     simulates a match by pairing ProductID with CustomerID, so only ids
#     present in both tables appear in the result.
product_customer_join_sql = """
SELECT p.ProductID, p.ProductName, c.Name, c.City
FROM product_sales p
JOIN customer_details c
ON p.ProductID = c.CustomerID
"""

spark.sql(product_customer_join_sql).show()

+---------+-----------+----+---------+
|ProductID|ProductName|Name|     City|
+---------+-----------+----+---------+
|        1|     Laptop| Ali|Hyderabad|
|        2|      Mouse|Neha|   Mumbai|
|        3|      Shirt|Ravi|    Delhi|
+---------+-----------+----+---------+



In [22]:
# 18. Customers who bought more than 2 products (total Quantity > 2).
#     Uses the same simulated ProductID = CustomerID match as exercise 17.
#     - Grouping includes CustomerID so two different customers who happen
#       to share a Name are not merged into one total.
#     - HAVING repeats the aggregate expression rather than the select-list
#       alias, which is the portable SQL form.
spark.sql("""
SELECT c.Name, SUM(p.Quantity) AS TotalPurchased
FROM product_sales p
JOIN customer_details c
ON p.ProductID = c.CustomerID
GROUP BY c.CustomerID, c.Name
HAVING SUM(p.Quantity) > 2
""").show()

+----+--------------+
|Name|TotalPurchased|
+----+--------------+
|Neha|             5|
|Ravi|             3|
+----+--------------+



In [23]:
# 19. Create (or replace) the sales_summary view over product_sales.
#     Total is derived per row as Price * Quantity.
sales_summary_ddl = """
CREATE OR REPLACE VIEW sales_summary AS
SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total
FROM product_sales
"""

spark.sql(sales_summary_ddl)

DataFrame[]

In [24]:
# 20. Rows of sales_summary whose Total is strictly greater than 1000.
(
    spark
    .sql("SELECT * FROM sales_summary WHERE Total > 1000")
    .show()
)

+-----------+-------+--------+--------+
|ProductName|  Price|Quantity|   Total|
+-----------+-------+--------+--------+
|     Laptop|55000.0|       2|110000.0|
|      Mouse|  800.0|       5|  4000.0|
|      Chair| 2000.0|       1|  2000.0|
|      Shirt| 1200.0|       3|  3600.0|
|       Book|  500.0|       4|  2000.0|
+-----------+-------+--------+--------+



In [25]:
# 21. Remove the sales_summary view; IF EXISTS avoids an error when the
#     view has already been dropped.
spark.sql(
    "DROP VIEW IF EXISTS sales_summary"
)

DataFrame[]

In [26]:
# 22. Remove both tables created in this notebook; IF EXISTS keeps the
#     cell safe to re-run after the tables are gone.
spark.sql(
    "DROP TABLE IF EXISTS product_sales"
)
spark.sql(
    "DROP TABLE IF EXISTS customer_details"
)

DataFrame[]

In [27]:
# 23. Drop the database.
#     sales_db was made the current database in exercise 2, so switch back
#     to `default` first; otherwise the session is left pointing at a
#     database that no longer exists and later unqualified queries fail.
spark.sql("USE default")

# CASCADE also removes any tables/views still inside sales_db.
spark.sql("DROP DATABASE IF EXISTS sales_db CASCADE")

DataFrame[]