In [91]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, year, sum as _sum

spark=SparkSession.builder.appName("June9Assignment1").getOrCreate()
spark

In [92]:
#Load both CSV files with schema inference.
df_spark_customers = spark.read.csv("/content/customers.csv", header=True, inferSchema=True)
df_spark_orders = spark.read.csv("/content/orders.csv", header=True, inferSchema=True)
df_spark_customers.show()
df_spark_orders.show()
#List all columns and data types.
df_spark_customers.printSchema()
df_spark_orders.printSchema()
#Count the total number of customers and orders.
print(df_spark_customers.count())
print(df_spark_orders.count())
#Show distinct cities.
df_spark_customers.select("City").distinct().show()

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
|       106| Test|   amit@gmail.com|     NULL|2023-03-10|
+----------+-----+-----------------+---------+----------+

+-------+----------+---------+-----------+--------+-------+----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+---------+-----------+--------+-------+----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-

In [93]:
#Add a column TotalAmount = Price * Quantity .
df_spark_orders = df_spark_orders.withColumn("TotalAmount", df_spark_orders.Price * df_spark_orders.Quantity)
df_spark_orders.show()
#Create a new column OrderYear from OrderDate .
df_spark_orders = df_spark_orders.withColumn("OrderYear", year(df_spark_orders.OrderDate))
df_spark_orders.show()
#Filter orders with TotalAmount > 10,000 .
df_spark_orders.filter(df_spark_orders.TotalAmount > 10000).show()
#Drop the Email column from customers .
df_spark_customers.drop("Email").show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+

+-------+----------+---------+-----------+--------+-------+----------+-----------+----

In [94]:
#Simulate a null in City and fill it with “Unknown”.
df_spark_customers = df_spark_customers.withColumn("City", when(df_spark_customers.City.isNull(), "Unknown").otherwise(df_spark_customers.City))
df_spark_customers.show()
#Label customers as “Loyal” if SignupDate is before 2022, else “New”.
df_spark_customers = df_spark_customers.withColumn("CustomerType", when(df_spark_customers.SignupDate < "2022-01-01", "Loyal").otherwise("New"))
df_spark_customers.show()
#Create OrderType column: "Low" if < 5,000, "High" if ≥ 5,000.
df_spark_orders = df_spark_orders.withColumn("OrderType", when(df_spark_orders.TotalAmount < 5000, "Low").otherwise("High"))
df_spark_orders.show()

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
|       106| Test|   amit@gmail.com|  Unknown|2023-03-10|
+----------+-----+-----------------+---------+----------+

+----------+-----+-----------------+---------+----------+------------+
|CustomerID| Name|            Email|     City|SignupDate|CustomerType|
+----------+-----+-----------------+---------+----------+------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|         New|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|         New|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|       

In [95]:
#Join customers and orders on CustomerID .
df_spark_customers_orders = df_spark_customers.join(df_spark_orders, "CustomerID", "inner")
df_spark_customers_orders.show()
#Get total orders and revenue per city.
df_spark_customers_orders.groupBy("City").agg({"OrderID": "count", "TotalAmount": "sum"}).show()
#Show top 3 customers by total spend.
df_spark_customers_orders.select("CustomerID", "TotalAmount").groupBy("CustomerID").agg(_sum("TotalAmount").alias("TotalSpent")).orderBy("TotalSpent", ascending=False).show(3)
#Count how many products each category has sold.
df_spark_customers_orders.groupBy("Category").agg({"Quantity": "sum"}).show()

+----------+-----+-----------------+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|            Email|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+-----------------+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|         New|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|         New|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|         New|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|       103| Ravi| ravi@hotmail.co

In [96]:
# Create database sales and switch to it.
spark.sql("create database if not exists sales")
spark.sql("use sales")
# Save both datasets as tables in the sales database.
# Overwrite the 'customers' table in the database
df_spark_customers.write.mode('overwrite').saveAsTable("customers")
df_spark_orders.write.mode('overwrite').saveAsTable("orders")
# Write SQL to:
# List all orders by customers from “Delhi”.
spark.sql("select * from orders where CustomerID in (select CustomerID from customers where City = 'Delhi')").show()
# Find average order value in each category.
spark.sql("select Category, avg(TotalAmount) as AvgOrderValue from orders group by Category").show()
# Create a view monthly_orders with month-wise total amount.
spark.sql("create or replace view monthly_orders as select month(OrderDate) as Month, sum(TotalAmount) as TotalAmount from orders group by month(OrderDate)")
spark.sql("select * from monthly_orders").show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|     High|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      37800.0|
|  Furniture|       3500.0|
| Appliances|       5000.0|
+-----------+-------------+

+-----+-----------+
|Month|TotalAmount|
+-----+-----------+
|    1|   101200.0|
|    3|    32500.0|
|    2|    28500.0|
+-----+-----------+



In [97]:
#Mask emails using regex (e.g a***@gmail.com ).
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.types import StringType
def mask_email(email):
    parts = email.split('@')
    if len(parts) == 2:
        username, domain = parts
        masked_username = username[0] + '*' * (len(username) - 2) + username[-1]
        return f"{masked_username}@{domain}"
    else:
        return email
mask_email_udf = udf(mask_email, StringType())
customers_df = df_spark_customers.withColumn("Email", mask_email_udf(col("Email")))
customers_df.show()
#Concatenate Name and City as “Name from City”.
from pyspark.sql.functions import concat, col, lit
df_spark_customers = df_spark_customers.withColumn("NameFromCity", concat(col("Name"), lit(" from "), col("City")))
df_spark_customers.show()
#Use datediff() to calculate customer age in days.
from pyspark.sql.functions import datediff,current_date
df_spark_customers = df_spark_customers.withColumn("AgeInDays", datediff(current_date(), col("SignupDate")))
df_spark_customers.show()
#Extract month name from OrderDate .
from pyspark.sql.functions import month, date_format
df_spark_orders = df_spark_orders.withColumn("MonthName", date_format(col("OrderDate"), "MMMM"))
df_spark_orders.show()

+----------+-----+-----------------+---------+----------+------------+
|CustomerID| Name|            Email|     City|SignupDate|CustomerType|
+----------+-----+-----------------+---------+----------+------------+
|       101|  Ali|    a*i@gmail.com|   Mumbai|2022-05-10|         New|
|       102| Neha|   n**a@yahoo.com|    Delhi|2023-01-15|         New|
|       103| Ravi| r**i@hotmail.com|Bangalore|2021-11-01|       Loyal|
|       104|Sneha|s***a@outlook.com|Hyderabad|2020-07-22|       Loyal|
|       105| Amit|   a**t@gmail.com|  Chennai|2023-03-10|         New|
|       106| Test|   a**t@gmail.com|  Unknown|2023-03-10|         New|
+----------+-----+-----------------+---------+----------+------------+

+----------+-----+-----------------+---------+----------+------------+--------------------+
|CustomerID| Name|            Email|     City|SignupDate|CustomerType|        NameFromCity|
+----------+-----+-----------------+---------+----------+------------+--------------------+
|       101| 

In [98]:
#Write a UDF to tag customers:
#“Gold” if spend > 50K, “Silver” if 10K–50K, “Bronze” if <10K.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def tag_customer(total_amount):
    if total_amount > 50000:
        return "Gold"
    elif total_amount >= 10000:
        return "Silver"
    else:
        return "Bronze"
tag_customer_udf = udf(tag_customer, StringType())
df_spark_customers_orders = df_spark_customers_orders.withColumn("CustomerTag", tag_customer_udf(col("TotalAmount")))
df_spark_customers_orders.show()
#Write a UDF to shorten product names (first 3 letters + ...).
def shorten_product_name(product_name):
    return product_name[:3] + "..."
shorten_product_name_udf = udf(shorten_product_name, StringType())
df_spark_customers_orders = df_spark_customers_orders.withColumn("Product", shorten_product_name_udf(col("Product")))
df_spark_customers_orders.show()


+----------+-----+-----------------+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----------+
|CustomerID| Name|            Email|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|CustomerTag|
+----------+-----+-----------------+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|         New|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|       Gold|
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|         New|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|     Bronze|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|         New|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    200

In [99]:
#Save the joined result as a Parquet file.
df_spark_customers_orders.write.mode('overwrite').parquet("/content/customers_orders.parquet")
#Read it back and verify schema.
df_spark_customers_orders_parquet = spark.read.parquet("/content/customers_orders.parquet")
df_spark_customers_orders_parquet.printSchema()
#Create and query a global temp view.
df_spark_customers_orders_parquet.createOrReplaceGlobalTempView("customers_orders_parquet")
spark.sql("select * from global_temp.customers_orders_parquet").show()
#Compare performance between CSV read and Parquet read.
df_spark_customers_orders_csv = spark.read.csv("/content/customers.csv", header=True, inferSchema=True)
df_spark_customers_orders_csv.show()
df_spark_customers_orders_csv.printSchema()
df_spark_customers_orders_csv.createOrReplaceGlobalTempView("customers_orders_csv")
spark.sql("select * from global_temp.customers_orders_csv").show()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)
 |-- CustomerTag: string (nullable = true)

+----------+-----+-----------------+---------+----------+------------+-------+-------+-----------+--------+-------+----------+-----------+---------+---------+-----------+
|CustomerID| Name|            Email|     City|SignupDate|CustomerType|OrderID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|CustomerTag|
+----------

In [100]:
#spark.sql("drop database sales")

In [101]:
spark.stop()