# PySpark + Spark SQL Task Sheet
Complete Solution with Comments and Sections

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, year, lit, isnull, concat_ws, regexp_replace, to_date, current_date, datediff, udf
from pyspark.sql.types import StringType

# Build (or reuse) the notebook's SparkSession; Hive support is switched on
# because tables are persisted with saveAsTable further down.
spark = (
    SparkSession.builder
    .appName("PracticeProject")
    .enableHiveSupport()
    .getOrCreate()
)

# Prepare DataFrames
# Column names are declared next to the rows they describe; each tuple below
# follows its column list positionally.
customer_columns = ["CustomerID", "Name", "Email", "City", "SignupDate"]
customers_data = [
    (101, "Ali", "ali@gmail.com", "Mumbai", "2022-05-10"),
    (102, "Neha", "neha@yahoo.com", "Delhi", "2023-01-15"),
    (103, "Ravi", "ravi@hotmail.com", "Bangalore", "2021-11-01"),
    (104, "Sneha", "sneha@outlook.com", "Hyderabad", "2020-07-22"),
    (105, "Amit", "amit@gmail.com", "Chennai", "2023-03-10"),
]

order_columns = ["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"]
orders_data = [
    (1, 101, "Laptop", "Electronics", 2, 50000.0, "2024-01-10"),
    (2, 101, "Mouse", "Electronics", 1, 1200.0, "2024-01-15"),
    (3, 102, "Tablet", "Electronics", 1, 20000.0, "2024-02-01"),
    (4, 103, "Bookshelf", "Furniture", 1, 3500.0, "2024-02-10"),
    (5, 104, "Mixer", "Appliances", 1, 5000.0, "2024-02-15"),
    (6, 105, "Notebook", "Stationery", 5, 500.0, "2024-03-01"),
    (7, 102, "Phone", "Electronics", 1, 30000.0, "2024-03-02"),
]

# Dates stay as 'YYYY-MM-DD' strings here; later steps parse or compare them.
customers_df = spark.createDataFrame(customers_data, schema=customer_columns)
orders_df = spark.createDataFrame(orders_data, schema=order_columns)

# Save as Hive Tables
# saveAsTable does not create the target database, so make sure `sales`
# exists first; IF NOT EXISTS keeps the cell safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS sales")

# mode("overwrite") replaces any table left behind by a previous run.
customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

## SECTION A: PySpark DataFrame Tasks

In [None]:
# 1. Add TotalAmount column (unit price multiplied by quantity)
line_total = col("Price") * col("Quantity")
orders_df = orders_df.withColumn("TotalAmount", line_total)

# 2. Filter orders with TotalAmount > 10000
# Taken before OrderYear/CategoryLabel are added, so it carries neither column.
is_high_value = col("TotalAmount") > 10000
high_value_orders_df = orders_df.filter(is_high_value)

# 3. Standardize City field to lowercase (overwrites City in place)
lowercase_city = expr("lower(City)")
customers_df = customers_df.withColumn("City", lowercase_city)

# 4. Add OrderYear column
orders_df = orders_df.withColumn("OrderYear", year(col("OrderDate")))

# 5. Fill null values in Email
customers_df = customers_df.fillna({"Email": "not_provided@example.com"})

# 6. Categorize orders using when/otherwise
# Below 5000 -> Low, 5000..20000 inclusive -> Medium, everything else -> High.
total_amount = col("TotalAmount")
category_label = (
    when(total_amount < 5000, "Low")
    .when(total_amount.between(5000, 20000), "Medium")
    .otherwise("High")
)
orders_df = orders_df.withColumn("CategoryLabel", category_label)

## SECTION B: Spark SQL Tasks

In [None]:
# Expose both DataFrames to Spark SQL under the names the queries below use.
orders_df.createOrReplaceTempView("orders")
customers_df.createOrReplaceTempView("customers")

# 7. Orders by “Ali”
# All order columns for the customer named 'Ali', matched through CustomerID.
ali_orders_sql = (
    "\n"
    "SELECT o.* \n"
    "FROM orders o \n"
    "JOIN customers c ON o.CustomerID = c.CustomerID \n"
    "WHERE c.Name = 'Ali'\n"
)
ali_orders_df = spark.sql(ali_orders_sql)
ali_orders_df.show()

# 8. Total spending per customer
# Group on the CustomerID key as well as the name, so that two customers who
# happen to share a name are totalled separately instead of being merged
# into a single row. The output columns are unchanged.
spark.sql("""
SELECT c.Name, SUM(o.TotalAmount) AS TotalSpending
FROM customers c
JOIN orders o ON c.CustomerID = o.CustomerID
GROUP BY c.CustomerID, c.Name
""").show()

# 9. Category with highest revenue
# Revenue is summed per category; sorting descending and keeping one row
# leaves the top category.
top_category_sql = """
SELECT Category, SUM(TotalAmount) AS Revenue
FROM orders
GROUP BY Category
ORDER BY Revenue DESC
LIMIT 1
"""
top_category_df = spark.sql(top_category_sql)
top_category_df.show()

# 10. Create customer_orders view
# OrderDate is carried in the view so that date filters can be applied to the
# view directly, without joining back to orders.
spark.sql("""
CREATE OR REPLACE TEMP VIEW customer_orders AS
SELECT c.Name AS CustomerName, o.Product, o.TotalAmount, o.OrderDate
FROM customers c
JOIN orders o ON c.CustomerID = o.CustomerID
""")

# 11. Query view for orders after Feb 2024
# Filter the view itself. Re-joining to orders on Product is unsafe because
# Product is not a unique key: once a product is ordered more than once the
# join multiplies rows and pairs customers with other customers' order dates.
# OrderDate holds 'YYYY-MM-DD' strings, so the comparison orders correctly.
spark.sql("""
SELECT *
FROM customer_orders
WHERE OrderDate > '2024-02-29'
""").show()

## SECTION C: Advanced Practice

In [None]:
# 12. Global Temp View
# The view is registered as "customers" and queried through the global_temp
# database prefix.
customers_df.createOrReplaceGlobalTempView("customers")

# City was lowercased in task 3, hence the lowercase literal in the filter.
mumbai_customers_df = spark.sql("SELECT * FROM global_temp.customers WHERE City = 'mumbai'")
mumbai_customers_df.show()

# 13. Save orders_df to Parquet
# One path variable is shared by the write and the read-back below.
parquet_path = "/tmp/orders_with_amount.parquet"
orders_df.write.mode("overwrite").parquet(parquet_path)

# 14. Read Parquet and count rows
orders_from_parquet = spark.read.parquet(parquet_path)
parquet_row_count = orders_from_parquet.count()
print("Total Orders in Parquet:", parquet_row_count)

## SECTION D: UDF + Built-in Function Tasks

In [None]:
# 15. Mask email
def mask_email(email):
    """Return *email* with its username masked after the first character.

    For example ``"ali@gmail.com"`` becomes ``"a***@gmail.com"``. The domain
    part is returned unchanged.

    Args:
        email: The address to mask. Values that are not strings (for example
            ``None``) are accepted and produce the placeholder.

    Returns:
        The masked address, or the placeholder ``"***@***"`` when *email* is
        not a string, does not contain exactly one ``"@"``, or has an empty
        username.
    """
    placeholder = "***@***"

    # Validate explicitly instead of wrapping everything in a bare `except:`,
    # which would also swallow KeyboardInterrupt/SystemExit.
    if not isinstance(email, str):
        return placeholder

    username, separator, domain = email.partition("@")
    # Require exactly one "@" and at least one username character to keep.
    if not separator or not username or "@" in domain:
        return placeholder

    return username[0] + "***@" + domain

# Wrap the Python masker as a Spark UDF that returns a string column.
mask_email_udf = udf(mask_email, StringType())

# 15-16. Add the masked email, then a "<Name> from <City>" label
masked_email = mask_email_udf(col("Email"))
name_city_label = concat_ws(" from ", col("Name"), col("City"))
customers_df = (
    customers_df
    .withColumn("MaskedEmail", masked_email)
    .withColumn("Label", name_city_label)
)

# 17. Remove special chars from Product
# The pattern matches every character that is not an ASCII letter or digit.
non_alphanumeric = "[^a-zA-Z0-9]"
clean_product = regexp_replace(col("Product"), non_alphanumeric, "")
orders_df = orders_df.withColumn("CleanProduct", clean_product)

# 18. Calculate customer age in days
# SignupDate is parsed into a date column first, then subtracted from today.
signup_date = to_date("SignupDate")
days_since_signup = datediff(current_date(), col("SignupDateFormatted"))
customers_df = (
    customers_df
    .withColumn("SignupDateFormatted", signup_date)
    .withColumn("CustomerAgeDays", days_since_signup)
)