In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("PySparkMasterTaskSet") \
    .getOrCreate()
spark

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [44]:
#1. Data Ingestion & Exploration
# Load CSVs with inferred schema
customers = spark.read.csv('/content/drive/MyDrive/customers.csv', header=True, inferSchema=True)
orders = spark.read.csv('/content/drive/MyDrive/orders.csv', header=True, inferSchema=True)


In [None]:
# List all columns and data types.
customers.printSchema()
orders.printSchema()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)



In [None]:
# Count total customers and orders
print("Total Customers:", customers.count())
print("Total Orders:", orders.count())


Total Customers: 5
Total Orders: 7


In [None]:
# Show distinct cities
customers.select("City").distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [None]:
#2. DataFrame Transformations
#Add a column TotalAmount = Price * Quantity
from pyspark.sql.functions import col, year
orders = orders.withColumn("TotalAmount", col("Price") * col("Quantity"))
orders.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+



In [None]:
#Create a new column OrderYear from OrderDate
orders = orders.withColumn("OrderYear", year("OrderDate"))
orders.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+

In [None]:
# Filter orders with TotalAmount > 10000
orders.filter(col("TotalAmount") > 10000).show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+



In [None]:
# Drop Email column
customers = customers.drop("Email")
customers.show()

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|Bangalore|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|       105| Amit|  Chennai|2023-03-10|
+----------+-----+---------+----------+



In [None]:
#3. Handling Nulls & Conditionals
#Simulate a null in City and fill it with “Unknown”
from pyspark.sql.functions import when, lit
customers_with_null = customers.withColumn("City", when(col("CustomerID") == 103, None).otherwise(col("City")))
customers_filled = customers_with_null.fillna("Unknown", subset=["City"])

In [None]:
#Label customers as “Loyal” if SignupDate is before 2022, else “New”
from pyspark.sql.functions import to_date
customers_filled = customers_filled.withColumn("SignupDate", to_date("SignupDate"))
customers_labeled = customers_filled.withColumn("Loyalty", when(col("SignupDate") < "2022-01-01", "Loyal").otherwise("New"))
customers_labeled.show()

+----------+-----+---------+----------+-------+
|CustomerID| Name|     City|SignupDate|Loyalty|
+----------+-----+---------+----------+-------+
|       101|  Ali|   Mumbai|2022-05-10|    New|
|       102| Neha|    Delhi|2023-01-15|    New|
|       103| Ravi|  Unknown|2021-11-01|  Loyal|
|       104|Sneha|Hyderabad|2020-07-22|  Loyal|
|       105| Amit|  Chennai|2023-03-10|    New|
+----------+-----+---------+----------+-------+



In [None]:
#Create OrderType column: "Low" if <5,000, "High" if ≥5,000
orders = orders.withColumn("OrderType", when(col("TotalAmount") < 5000, "Low").otherwise("High"))
orders.select("OrderID", "Product", "TotalAmount", "OrderType").show()


+-------+---------+-----------+---------+
|OrderID|  Product|TotalAmount|OrderType|
+-------+---------+-----------+---------+
|      1|   Laptop|   100000.0|     High|
|      2|    Mouse|     1200.0|      Low|
|      3|   Tablet|    20000.0|     High|
|      4|Bookshelf|     3500.0|      Low|
|      5|    Mixer|     5000.0|     High|
|      6| Notebook|     2500.0|      Low|
|      7|    Phone|    30000.0|     High|
+-------+---------+-----------+---------+



In [None]:
#4. Joins & Aggregations
#Join customers and orders on CustomerID
joined_df = customers_labeled.join(orders, on="CustomerID", how="inner")

In [None]:
# Total orders and revenue per city
from pyspark.sql.functions import sum, count
joined_df.groupBy("City").agg(
    count("OrderID").alias("TotalOrders"),
    sum("TotalAmount").alias("TotalRevenue")
).show()

+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|  Chennai|          1|      2500.0|
|   Mumbai|          2|    101200.0|
|  Unknown|          1|      3500.0|
|    Delhi|          2|     50000.0|
|Hyderabad|          1|      5000.0|
+---------+-----------+------------+



In [None]:
# Show top 3 customers by total spend
joined_df.groupBy("Name").agg(
    sum("TotalAmount").alias("TotalSpend")
).orderBy(col("TotalSpend").desc()).show(3)


+-----+----------+
| Name|TotalSpend|
+-----+----------+
|  Ali|  101200.0|
| Neha|   50000.0|
|Sneha|    5000.0|
+-----+----------+
only showing top 3 rows



In [None]:
#Count how many products each category has sold
orders.groupBy("Category").agg(
    count("Product").alias("ProductsSold")
).show()

+-----------+------------+
|   Category|ProductsSold|
+-----------+------------+
| Stationery|           1|
|Electronics|           4|
|  Furniture|           1|
| Appliances|           1|
+-----------+------------+



In [None]:
#5. Spark SQL Tasks
#Create database sales and switch to it
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.catalog.setCurrentDatabase("sales")

In [None]:
#Save both datasets as tables in the sales database.
customers_labeled.write.mode("overwrite").saveAsTable("customers")
orders.write.mode("overwrite").saveAsTable("orders")

In [None]:
# SQL: Orders from Delhi
spark.sql("""
SELECT o.* FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
WHERE c.City = 'Delhi'
""").show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|     High|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+



In [None]:
# SQL: Average order value per category
spark.sql("""
SELECT Category, AVG(TotalAmount) AS AvgValue
FROM orders
GROUP BY Category
""").show()

+-----------+--------+
|   Category|AvgValue|
+-----------+--------+
| Stationery|  2500.0|
|Electronics| 37800.0|
|  Furniture|  3500.0|
| Appliances|  5000.0|
+-----------+--------+



In [None]:
# Create monthly_orders view
spark.sql("""
CREATE OR REPLACE TEMP VIEW monthly_orders AS
SELECT MONTH(OrderDate) AS Month, SUM(TotalAmount) AS MonthlyTotal
FROM orders
GROUP BY MONTH(OrderDate)
""")
spark.sql("SELECT * FROM monthly_orders").show()

+-----+------------+
|Month|MonthlyTotal|
+-----+------------+
|    1|    101200.0|
|    3|     32500.0|
|    2|     28500.0|
+-----+------------+



In [50]:
#6. String & Date Functions
#Mask emails using regex (e.g., a***@gmail.com )
from pyspark.sql.functions import regexp_replace, concat_ws, datediff, current_date, month
# Re-load customers with Email since we dropped email in task 2 (needed for email masking)
customers_with_email = spark.read.csv('/content/drive/MyDrive/customers.csv', header=True, inferSchema=True)

# Mask email using regex
masked_emails = customers_with_email.withColumn("MaskedEmail", regexp_replace("Email", r"(^.).+(@.*)", r"\1***\2"))
masked_emails.select("Name", "Email", "MaskedEmail").show()

+-----+-----------------+-----------+
| Name|            Email|MaskedEmail|
+-----+-----------------+-----------+
|  Ali|    ali@gmail.com|      1***2|
| Neha|   neha@yahoo.com|      1***2|
| Ravi| ravi@hotmail.com|      1***2|
|Sneha|sneha@outlook.com|      1***2|
| Amit|   amit@gmail.com|      1***2|
+-----+-----------------+-----------+



In [51]:
# Concatenate Name and City
customers_named = customers_labeled.withColumn("NameCity", concat_ws(" from ", "Name", "City")).show()


+----------+-----+---------+----------+-------+--------------------+
|CustomerID| Name|     City|SignupDate|Loyalty|            NameCity|
+----------+-----+---------+----------+-------+--------------------+
|       101|  Ali|   Mumbai|2022-05-10|    New|     Ali from Mumbai|
|       102| Neha|    Delhi|2023-01-15|    New|     Neha from Delhi|
|       103| Ravi|  Unknown|2021-11-01|  Loyal|   Ravi from Unknown|
|       104|Sneha|Hyderabad|2020-07-22|  Loyal|Sneha from Hyderabad|
|       105| Amit|  Chennai|2023-03-10|    New|   Amit from Chennai|
+----------+-----+---------+----------+-------+--------------------+



In [52]:
from pyspark.sql.functions import datediff, current_date
#Use datediff() to calculate customer age in days
customers_with_age = customers.withColumn("CustomerAge", datediff(current_date(), col("SignupDate")))
customers_with_age.select("Name", "SignupDate", "CustomerAge").show()

+-----+----------+-----------+
| Name|SignupDate|CustomerAge|
+-----+----------+-----------+
|  Ali|2022-05-10|       1126|
| Neha|2023-01-15|        876|
| Ravi|2021-11-01|       1316|
|Sneha|2020-07-22|       1783|
| Amit|2023-03-10|        822|
+-----+----------+-----------+



In [53]:
#Extract month name from OrderDate
from pyspark.sql.functions import date_format
orders_with_month = orders.withColumn("MonthName", date_format("OrderDate", "MMMM"))
orders_with_month.select("OrderID", "OrderDate", "MonthName").show()

+-------+----------+---------+
|OrderID| OrderDate|MonthName|
+-------+----------+---------+
|      1|2024-01-10|  January|
|      2|2024-01-15|  January|
|      3|2024-02-01| February|
|      4|2024-02-10| February|
|      5|2024-02-15| February|
|      6|2024-03-01|    March|
|      7|2024-03-02|    March|
+-------+----------+---------+



In [54]:
#7. UDFs and Complex Logic
#Write a UDF to tag customers:
#“Gold” if spend >50K, “Silver” if 10K–50K, “Bronze” if <10K
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def customer_tag(amount):
    if amount > 50000:
        return "Gold"
    elif amount >= 10000:
        return "Silver"
    else:
        return "Bronze"

tag_udf = udf(customer_tag, StringType())

customer_spending = joined_df.groupBy("CustomerID", "Name").agg(sum("TotalAmount").alias("TotalSpend"))
customer_spending = customer_spending.withColumn("Tag", tag_udf("TotalSpend"))
customer_spending.show()

+----------+-----+----------+------+
|CustomerID| Name|TotalSpend|   Tag|
+----------+-----+----------+------+
|       105| Amit|    2500.0|Bronze|
|       104|Sneha|    5000.0|Bronze|
|       101|  Ali|  101200.0|  Gold|
|       102| Neha|   50000.0|Silver|
|       103| Ravi|    3500.0|Bronze|
+----------+-----+----------+------+



In [55]:
#Write a UDF to shorten product names (first 3 letters + ...)
def short_name(name):
    return name[:3] + "..." if len(name) > 3 else name

short_udf = udf(short_name, StringType())
orders_short = orders.withColumn("ShortProduct", short_udf("Product"))
orders_short.select("Product", "ShortProduct").show()

+---------+------------+
|  Product|ShortProduct|
+---------+------------+
|   Laptop|      Lap...|
|    Mouse|      Mou...|
|   Tablet|      Tab...|
|Bookshelf|      Boo...|
|    Mixer|      Mix...|
| Notebook|      Not...|
|    Phone|      Pho...|
+---------+------------+



In [56]:
#8. Parquet & Views
#Save the joined result as a Parquet file
joined_df.write.mode("overwrite").parquet("/content/drive/MyDrive/joined_data.parquet")


In [57]:
# Read back and verify
parquet_df = spark.read.parquet("/content/drive/MyDrive/joined_data.parquet")
parquet_df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- Loyalty: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)



In [58]:
# Create and query a global temp view
parquet_df.createOrReplaceGlobalTempView("global_joined")

spark.sql("SELECT * FROM global_temp.global_joined WHERE TotalAmount > 10000").show()


+----------+----+------+----------+-------+-------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID|Name|  City|SignupDate|Loyalty|OrderID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+----+------+----------+-------+-------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|       101| Ali|Mumbai|2022-05-10|    New|      1| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|
|       102|Neha| Delhi|2023-01-15|    New|      3| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|       102|Neha| Delhi|2023-01-15|    New|      7|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|     High|
+----------+----+------+----------+-------+-------+-------+-----------+--------+-------+----------+-----------+---------+---------+



In [59]:
#Compare performance between CSV read and Parquet read
import time

start_csv = time.time()
_ = spark.read.csv('/content/drive/MyDrive/orders.csv', header=True, inferSchema=True).count()
end_csv = time.time()

start_parquet = time.time()
_ = spark.read.parquet("/content/drive/MyDrive/joined_data.parquet").count()
end_parquet = time.time()

print(f"CSV Read Time: {end_csv - start_csv:.4f} seconds")
print(f"Parquet Read Time: {end_parquet - start_parquet:.4f} seconds")

CSV Read Time: 0.8832 seconds
Parquet Read Time: 0.5586 seconds
