In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
spark = SparkSession.builder.appName("PracticeProject").enableHiveSupport().getOrCreate()

customers_data = [ (101, 'Ali', 'ali@gmail.com', 'Mumbai', '2022-05-10'),
(102, 'Neha', 'neha@yahoo.com', 'Delhi', '2023-01-15'),
(103, 'Ravi', 'ravi@hotmail.com', 'Bangalore', '2021-11-01'),
(104, 'Sneha', 'sneha@outlook.com', 'Hyderabad', '2020-07-22'),
(105, 'Amit', 'amit@gmail.com', 'Chennai', '2023-03-10'), ]

orders_data = [ (1, 101, 'Laptop', 'Electronics', 2, 50000.0, '2024-01-10'),
(2, 101, 'Mouse', 'Electronics', 1, 1200.0, '2024-01-15'),
(3, 102, 'Tablet', 'Electronics', 1, 20000.0, '2024-02-01'),
(4, 103, 'Bookshelf', 'Furniture', 1, 3500.0, '2024-02-10'),
(5, 104, 'Mixer', 'Appliances', 1, 5000.0, '2024-02-15'),
(6, 105, 'Notebook', 'Stationery', 5, 500.0, '2024-03-01'),
(7, 102, 'Phone', 'Electronics', 1, 30000.0, '2024-03-02'), ]

customers_df = spark.createDataFrame(customers_data, ["CustomerID", "Name", "Email", "City", "SignupDate"])
orders_df = spark.createDataFrame(orders_data, ["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"])
spark.sql("create database sales")
spark.sql("use sales")
customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

**SECTION A: PySpark DataFrame Tasks**

In [4]:
#1. Add a column TotalAmount = Price * Quantity to the order_df
from pyspark.sql.functions import col
orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))
orders_df.show()

#2. Filter all orders with TotalAmount > 10000 .
a= orders_df.filter(col("TotalAmount") > 10000)
a.show()

#3. Standardize the City field in orders_df . customers_df (e.g., lowercase).
from pyspark.sql.functions import lower
customers_df = customers_df.withColumn("City", lower(col("City")))
customers_df.show()

#4. Extract year from OrderDate and add a new column OrderYear .
from pyspark.sql.functions import year
order_df = orders_df.withColumn("OrderYear", year(col("OrderDate")))
order_df.show()

#5. Fill null values in any column of your choice with defaults.
b = orders_df.fillna({"Quantity": 0})
b.show()

#6. Use when/otherwise to categorize orders: <5000 : "Low" 5000-20000 : "Medium" >20000 : "High"
from pyspark.sql.functions import when
c = orders_df.withColumn("OrderCategory",
when(col("TotalAmount") < 5000, "Low")
.when((col("TotalAmount") >= 5000) & (col("TotalAmount") <= 20000), "Medium")
.otherwise("High"))
c.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+

+-------+----------+-------+-----------+--------+-------+----------+-----------+
|Orde

**SECTION B: Spark SQL Tasks**

In [5]:
#7. Run a SQL query to list all orders made by “Ali”.
spark.sql("""
select o.* FROM sales.orders o
join sales.customers c on o.CustomerID = c.CustomerID
where c.Name = 'Ali'""").show()

#8. Get total spending by each customer using SQL.
spark.sql("""
select c.CustomerID, c.Name, SUM(o.Price * o.Quantity) as TotalSpending
from sales.orders o
join sales.customers c on o.CustomerID = c.CustomerID
group by c.CustomerID, c.Name
order by TotalSpending desc""").show()

#9. Find out which category made the highest total revenue.
spark.sql("""
select Category, SUM(Price * Quantity) as TotalRevenue
from sales.orders
group by Category
order by TotalRevenue desc limit 1""").show()

#10. Create a view customer_orders showing CustomerName, Product, TotalAmount .
spark.sql("""
create or replace view customer_orders as
select c.Name as CustomerName, o.Product, (o.Price * o.Quantity) as TotalAmount
from sales.orders o
join sales.customers c on o.CustomerID = c.CustomerID""")

#11. Query the view for products ordered after Feb 2024.
spark.sql("""
select * from customer_orders co
join sales.orders o on co.Product = o.Product
where o.OrderDate >= '2024-03-01'""").show()

+-------+----------+-------+-----------+--------+-------+----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+-------+-----------+--------+-------+----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|  Mouse|Electronics|       1| 1200.0|2024-01-15|
+-------+----------+-------+-----------+--------+-------+----------+

+----------+-----+-------------+
|CustomerID| Name|TotalSpending|
+----------+-----+-------------+
|       101|  Ali|     101200.0|
|       102| Neha|      50000.0|
|       104|Sneha|       5000.0|
|       103| Ravi|       3500.0|
|       105| Amit|       2500.0|
+----------+-----+-------------+

+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|Electronics|    151200.0|
+-----------+------------+

+------------+--------+-----------+-------+----------+--------+-----------+--------+-------+----------+
|CustomerName| Product|TotalAmount|OrderID|Cust

**SECTION C: Advanced Practice**

In [12]:
#12. Create a Global Temp View from customers_df , then query it using (SELECT * FROM global_temp.customers WHERE City = 'Mumbai'; ):
customers_df.createOrReplaceGlobalTempView("customers")
spark.sql("select * from global_temp.customers where City = 'mumbai';").show()

#13. Save the transformed orders_df (with TotalAmount) to a Parquet file.
orders_df.write.parquet("orders_with_total.parquet", mode="overwrite")

#14. Read back the Parquet file and count how many orders are in it.
parquet_df = spark.read.parquet("orders_with_total.parquet")
print(f"Total orders: {parquet_df.count()}")

+----------+----+-------------+------+----------+--------------+---------------+
|CustomerID|Name|        Email|  City|SignupDate|   MaskedEmail|CustomerAgeDays|
+----------+----+-------------+------+----------+--------------+---------------+
|       101| Ali|ali@gmail.com|mumbai|2022-05-10|a***@gmail.com|           1121|
+----------+----+-------------+------+----------+--------------+---------------+

Total orders: 7


SECTION D: UDF + Built-in Function Tasks

In [13]:
#15. Write a UDF that masks emails like: ali@gmail.com → a***@gmail.com .
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def mask_email(email):
    if '@' in email:
        name, domain = email.split('@')
        return f"{name[0]}***@{domain}"
    return email
masked = udf(mask_email, StringType())
customers_df = customers_df.withColumn("MaskedEmail", masked(col("Email")))
customers_df.select("CustomerID", "Name", "MaskedEmail", "City", "SignupDate").show()

#16. Use concat_ws() to create a full label like: 17. Use 'Ali from Mumbai'
from pyspark.sql.functions import concat_ws, lit
cust_df = customers_df.withColumn("FullLabel",
    concat_ws(" ", col("Name"), lit("from"), col("City")))
cust_df.show()

#17. Use regexp_replace() to remove special characters from product names
from pyspark.sql.functions import regexp_replace
orders_df = orders_df.withColumn("CleanProduct",
    regexp_replace(col("Product"), "[^a-zA-Z0-9]", ""))
orders_df.show()

#18. Use to_date() and datediff() to calculate customer age in days (from SignupDate to today).
from pyspark.sql.functions import datediff, current_date, to_date
customers_df = customers_df.withColumn("SignupDate", to_date(col("SignupDate")))
customers_df = customers_df.withColumn("CustomerAgeDays",
    datediff(current_date(), col("SignupDate")))
customers_df.show()

+----------+-----+----------------+---------+----------+
|CustomerID| Name|     MaskedEmail|     City|SignupDate|
+----------+-----+----------------+---------+----------+
|       101|  Ali|  a***@gmail.com|   mumbai|2022-05-10|
|       102| Neha|  n***@yahoo.com|    delhi|2023-01-15|
|       103| Ravi|r***@hotmail.com|bangalore|2021-11-01|
|       104|Sneha|s***@outlook.com|hyderabad|2020-07-22|
|       105| Amit|  a***@gmail.com|  chennai|2023-03-10|
+----------+-----+----------------+---------+----------+

+----------+-----+-----------------+---------+----------+----------------+---------------+--------------------+
|CustomerID| Name|            Email|     City|SignupDate|     MaskedEmail|CustomerAgeDays|           FullLabel|
+----------+-----+-----------------+---------+----------+----------------+---------------+--------------------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|  a***@gmail.com|           1121|     Ali from mumbai|
|       102| Neha|   neha@yahoo.com|  