In [65]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window
# Obtain the application's SparkSession (reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("PySparkAssessment")
    .getOrCreate()
)

**Task 1: Data Ingestion & Exploration**

In [4]:
# Ingest both CSV files: the first row supplies column names and Spark infers the types
customers = spark.read.options(header=True, inferSchema=True).csv("customers.csv")
orders = spark.read.options(header=True, inferSchema=True).csv("orders.csv")

# Inspect the column names and inferred data types of each dataset
print("Customers schema:")
customers.printSchema()
print("\nOrders schema:")
orders.printSchema()

# Report the row count of each dataset
print(f"Total customers: {customers.count()}")
print(f"Total orders: {orders.count()}")

# List the unique values found in the City column
print("\nDistinct cities:")
customers.select("City").distinct().show()

Customers schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)


Orders schema:
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)

Total customers: 5
Total orders: 7

Distinct cities:
+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



**Task 2: DataFrame Transformations**

In [8]:
# Derive each order's line total (unit price multiplied by quantity)
orders = orders.withColumn("TotalAmount", orders["Price"] * orders["Quantity"])
orders.select("OrderID", "TotalAmount").show()

# Derive the calendar year of each order from its OrderDate
orders = orders.withColumn("OrderYear", year("OrderDate"))
orders.select("OrderYear").show()

# Keep only the orders whose TotalAmount exceeds 10,000
high = orders.where(orders["TotalAmount"] > 10000)
high.show()

# Remove the Email column from the customers dataset
customers = customers.drop("Email")
customers.show()

+-------+-----------+
|OrderID|TotalAmount|
+-------+-----------+
|      1|   100000.0|
|      2|     1200.0|
|      3|    20000.0|
|      4|     3500.0|
|      5|     5000.0|
|      6|     2500.0|
|      7|    30000.0|
+-------+-----------+

+---------+
|OrderYear|
+---------+
|     2024|
|     2024|
|     2024|
|     2024|
|     2024|
|     2024|
|     2024|
+---------+

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+----------

**Task 3: Handling Nulls & Conditionals**

In [12]:
# Blank out City for customer 102 to simulate a missing value,
# then replace every missing City with the placeholder "Unknown"
customers = (
    customers
    .withColumn("City", when(col("CustomerID") == 102, None).otherwise(col("City")))
    .fillna("Unknown", subset=["City"])
)
customers.select("CustomerID", "Name", "City").show()

# Customers who signed up before 2022 are "Loyal"; everyone else is "New"
customers = customers.withColumn(
    "CustomerType",
    when(year("SignupDate") < 2022, "Loyal").otherwise("New"),
)
customers.select("customerid", "name", "signupdate", "customertype").show()

# Orders below 5000 in TotalAmount are "Low"; all others are "High"
orders = orders.withColumn(
    "OrderType",
    when(orders["TotalAmount"] < 5000, "Low").otherwise("High"),
)
orders.select("OrderID", "TotalAmount", "OrderType").show()

+----------+-----+---------+
|CustomerID| Name|     City|
+----------+-----+---------+
|       101|  All|   Mumbai|
|       102| Neha|  Unknown|
|       103| Ravi|Bangalore|
|       104|Sneha|Hyderabad|
|       105| Amit|  Chennai|
+----------+-----+---------+

+----------+-----+----------+------------+
|customerid| name|signupdate|customertype|
+----------+-----+----------+------------+
|       101|  All|2022-05-10|         New|
|       102| Neha|2023-01-15|         New|
|       103| Ravi|2021-11-01|       Loyal|
|       104|Sneha|2020-07-22|       Loyal|
|       105| Amit|2023-03-10|         New|
+----------+-----+----------+------------+

+-------+-----------+---------+
|OrderID|TotalAmount|OrderType|
+-------+-----------+---------+
|      1|   100000.0|     High|
|      2|     1200.0|      Low|
|      3|    20000.0|     High|
|      4|     3500.0|      Low|
|      5|     5000.0|     High|
|      6|     2500.0|      Low|
|      7|    30000.0|     High|
+-------+-----------+---------

**Task 4: Joins & Aggregations**

In [14]:
# Combine customers with their orders on the shared CustomerID key
customer_orders = customers.join(orders, on="CustomerID")
customer_orders.show()

# Order count and revenue for each city, highest revenue first
city_stats = (
    customer_orders.groupBy("City")
    .agg(
        count("OrderID").alias("TotalOrders"),
        sum("TotalAmount").alias("TotalRevenue"),
    )
    .orderBy(desc("TotalRevenue"))
)
city_stats.show()

# The three customers with the largest total spend
top_customers = (
    customer_orders.groupBy("CustomerID", "Name")
    .agg(sum("TotalAmount").alias("TotalSpend"))
    .orderBy(desc("TotalSpend"))
    .limit(3)
)
top_customers.show()

# Units sold in each product category, largest first
category_stats = (
    orders.groupBy("Category")
    .agg(sum("Quantity").alias("ProductsSold"))
    .orderBy(desc("ProductsSold"))
)
category_stats.show()

+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|       101|  All|   Mumbai|2022-05-10|         New|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|
|       101|  All|   Mumbai|2022-05-10|         New|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|
|       102| Neha|  Unknown|2023-01-15|         New|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|       103| Ravi|Bangalore|2021-11-01|       Loyal|      4|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|      Low|

**Task 5: Spark SQL Tasks**

In [59]:
# Create the sales database and switch to it.
# IF NOT EXISTS makes this a no-op when the database is already present,
# so the cell can be re-run without raising an "already exists" error.
spark.sql("create database if not exists sales")
spark.catalog.setCurrentDatabase("sales")

# Save datasets as managed tables in the current database (replacing earlier copies)
customers.write.mode("overwrite").saveAsTable("customers")
orders.write.mode("overwrite").saveAsTable("orders")

# SQL queries
# List all orders by customers from "Delhi"
# NOTE(review): the `customers` table saved above carries the Task 3 edit that
# replaced customer 102's City with "Unknown"; if 102 was the only Delhi
# customer this query returns no rows -- confirm which data is intended.
spark.sql("""select o.*
from orders o
join customers c on o.CustomerID = c.CustomerID
where c.City = 'Delhi'""").show()

# Find average order value in each category
spark.sql("""select Category, AVG(TotalAmount) as AvgOrderValue
from orders
group by Category""").show()

# Create (or replace) a view with the month-wise total amount, then list it by month
spark.sql("""create or replace view monthly_orders AS
select date_format(OrderDate, 'yyyy-MM') as Month, sum(TotalAmount) as MonthlyTotal
from orders
group by date_format(OrderDate, 'yyyy-MM')""")
spark.sql("SELECT * FROM monthly_orders ORDER BY Month").show()


+-------+----------+-------+-----------+--------+------+----------+-----------+---------+---------+
|OrderID|CustomerID|Product|   Category|Quantity| Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+-----------+--------+------+----------+-----------+---------+---------+
|      8|       106| Laptop|Electronics|       2|2500.0|2024-06-09|     5000.0|     2024|      Low|
+-------+----------+-------+-----------+--------+------+----------+-----------+---------+---------+

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      31240.0|
|  Furniture|       3500.0|
| Appliances|       5000.0|
+-----------+-------------+

+-------+------------+
|  Month|MonthlyTotal|
+-------+------------+
|2024-01|    101200.0|
|2024-02|     28500.0|
|2024-03|     32500.0|
|2024-06|      5000.0|
+-------+------------+



**Task 6: String & Date Functions**

In [62]:
# Email was dropped from `customers` earlier, so reload the raw file to mask it:
# every character of the local part except the first becomes "*"
email = spark.read.options(header=True, inferSchema=True).csv("customers.csv")
masked = email.withColumn(
    "Email",
    regexp_replace("Email", "(?<=.).(?=.*@)", "*"),
)
masked.show()

# Build a "<Name> from <City>" label for each customer
customers = customers.withColumn(
    "NameCity",
    concat("Name", lit(" from "), "City"),
)
customers.select("customerid", "Name", "NameCity").show()

# Days elapsed between the signup date and today
customers = customers.withColumn(
    "CustomerAgeDays",
    datediff(current_date(), "SignupDate"),
)
customers.select("customerid", "Name", "customeragedays").show()

# Full month name of each order's date
orders = orders.withColumn(
    "OrderMonth",
    date_format("OrderDate", "MMMM"),
)
orders.select("OrderID", "OrderDate", "OrderMonth").show()


+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  All|    a**@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   n***@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| r***@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|s****@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   a***@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+

+----------+-----+--------------------+
|customerid| Name|            NameCity|
+----------+-----+--------------------+
|       101|  All|     All from Mumbai|
|       102| Neha|   Neha from Unknown|
|       103| Ravi| Ravi from Bangalore|
|       104|Sneha|Sneha from Hyderabad|
|       105| Amit|   Amit from Chennai|
+----------+-----+--------------------+

+----------+-----+---------------+
|customerid| Name|customeragedays|
+----------+-----+---------------+
|       101

**Task 7: UDFs and Complex Logic**

In [64]:
# UDF to tag customers by spend
def customer_tag(total_spend):
    """Classify a customer's total spend into a tier.

    Args:
        total_spend: Numeric total spend, or None when the aggregate is null.

    Returns:
        "Bronze" for spend below 10000, "Silver" for 10000 through 50000
        inclusive, "Gold" for anything above 50000, or None when
        ``total_spend`` is None.
    """
    # A null aggregate reaches a Python UDF as None; comparing None with a
    # number raises TypeError, so pass the null through instead.
    if total_spend is None:
        return None
    if total_spend < 10000:
        return "Bronze"
    # Reaching here already implies total_spend >= 10000.
    if total_spend <= 50000:
        return "Silver"
    return "Gold"

# Wrap the tagging function as a string-returning UDF
customer_tag_udf = udf(customer_tag, StringType())

# Total spend per customer, then tag each total with its tier
customer_spend = (
    customer_orders.groupBy("CustomerID")
    .agg(sum("TotalAmount").alias("TotalSpend"))
)
customer_spend = customer_spend.withColumn(
    "CustomerTag",
    customer_tag_udf(customer_spend["TotalSpend"]),
)
customer_spend.show()

# UDF to shorten product names
def shorten_product(name):
    """Abbreviate a product name to its first three characters plus "...".

    Args:
        name: Product name string, or None when the column value is null.

    Returns:
        The first three characters of ``name`` followed by "...", or None
        when ``name`` is None.
    """
    # A null Product reaches a Python UDF as None; slicing None raises
    # TypeError, so pass the null through instead.
    if name is None:
        return None
    return name[:3] + "..."

# Wrap the shortening function as a string-returning UDF and apply it to Product
shorten_product_udf = udf(shorten_product, StringType())
orders = orders.withColumn(
    "ShortProduct",
    shorten_product_udf(orders["Product"]),
)
orders.select("OrderID", "Product", "ShortProduct", "TotalAmount").show()

+----------+----------+-----------+
|CustomerID|TotalSpend|CustomerTag|
+----------+----------+-----------+
|       101|  101200.0|       Gold|
|       103|    3500.0|     Bronze|
|       102|   50000.0|     Silver|
|       105|    2500.0|     Bronze|
|       104|    5000.0|     Bronze|
+----------+----------+-----------+

+-------+---------+------------+-----------+
|OrderID|  Product|ShortProduct|TotalAmount|
+-------+---------+------------+-----------+
|      1|   Laptop|      Lap...|   100000.0|
|      2|    Mouse|      Mou...|     1200.0|
|      3|   Tablet|      Tab...|    20000.0|
|      4|Bookshelf|      Boo...|     3500.0|
|      5|    Mixer|      Mix...|     5000.0|
|      6| Notebook|      Not...|     2500.0|
|      7|    Phone|      Pho...|    30000.0|
+-------+---------+------------+-----------+



**Task 8: Parquet & Views**

In [66]:
# Save the joined customer/order result as Parquet (replacing any earlier copy)
customer_orders.write.mode("overwrite").parquet("customer_orders.parquet")

# Read it back and print the schema to verify the columns and types
parquet_df = spark.read.parquet("customer_orders.parquet")
print("\nParquet file schema:")
parquet_df.printSchema()

# Create and query a global temp view (exposed under the global_temp database).
# createOrReplaceGlobalTempView is used so that re-running this cell does not
# fail because a view with this name already exists.
customer_orders.createOrReplaceGlobalTempView("global_customer_orders")
spark.sql("SELECT * FROM global_temp.global_customer_orders LIMIT 5").show()

# Performance comparison: time a read followed by a full count for each format.
# time.perf_counter() is used because it is a monotonic clock meant for
# measuring short intervals.
# NOTE(review): the CSV timing reads orders.csv while the Parquet timing reads
# the joined customer_orders data, so the two figures do not cover the same data.
start_time = time.perf_counter()
csv_df = spark.read.option("header", True).csv("orders.csv")
csv_df.count()
print(f"CSV read time: {time.perf_counter() - start_time} seconds")

start_time = time.perf_counter()
parquet_df = spark.read.parquet("customer_orders.parquet")
parquet_df.count()
print(f"Parquet read time: {time.perf_counter() - start_time} seconds")


Parquet file schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)

+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---