In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("PracticeProject")
    .enableHiveSupport()
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [17]:
# Customers data: one tuple per customer, in the order
# (CustomerID, Name, Email, City, SignupDate); dates are ISO-8601 strings.
customers_data = [
    (101, 'Ali', 'ali@gmail.com', 'Mumbai', '2022-05-10'),
    (102, 'Neha', 'neha@yahoo.com', 'Delhi', '2023-01-15'),
    (103, 'Ravi', 'ravi@hotmail.com', 'Bangalore', '2021-11-01'),
    (104, 'Sneha', 'sneha@outlook.com', 'Hyderabad', '2020-07-22'),
    (105, 'Amit', 'amit@gmail.com', 'Chennai', '2023-03-10'),
]

In [18]:
# Orders data: one tuple per order, in the order
# (OrderID, CustomerID, Product, Category, Quantity, Price, OrderDate).
# Price is the per-unit price; OrderDate is an ISO-8601 string.
orders_data = [
    (1, 101, 'Laptop', 'Electronics', 2, 50000.0, '2024-01-10'),
    (2, 101, 'Mouse', 'Electronics', 1, 1200.0, '2024-01-15'),
    (3, 102, 'Tablet', 'Electronics', 1, 20000.0, '2024-02-01'),
    (4, 103, 'Bookshelf', 'Furniture', 1, 3500.0, '2024-02-10'),
    (5, 104, 'Mixer', 'Appliances', 1, 5000.0, '2024-02-15'),
    (6, 105, 'Notebook', 'Stationery', 5, 500.0, '2024-03-01'),
    (7, 102, 'Phone', 'Electronics', 1, 30000.0, '2024-03-02'),
]

In [19]:
# Column names are listed in the same order as the fields of each data tuple;
# Spark infers the column types from the Python values.
customer_columns = ["CustomerID", "Name", "Email", "City", "SignupDate"]
order_columns = ["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"]

customers_df = spark.createDataFrame(customers_data, schema=customer_columns)
orders_df = spark.createDataFrame(orders_data, schema=order_columns)

In [20]:
# Persist both DataFrames as managed tables in the `sales` database,
# replacing any table left over from a previous run.
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
customers_df.write.saveAsTable("sales.customers", mode="overwrite")
orders_df.write.saveAsTable("sales.orders", mode="overwrite")

In [22]:
# Expose both DataFrames to Spark SQL as session-scoped temp views.
# A view keeps the DataFrame it was created from: columns added by rebinding
# `customers_df` / `orders_df` later are not visible through these views
# unless the view is registered again.
for view_name, frame in (("customers", customers_df), ("orders", orders_df)):
    frame.createOrReplaceTempView(view_name)


In [23]:
# Sanity check: print the full contents of both temp views.
for query in ("SELECT * FROM customers", "SELECT * FROM orders"):
    spark.sql(query).show()


+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+

+-------+----------+---------+-----------+--------+-------+----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+---------+-----------+--------+-------+----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|
|      4|       103|Bookshelf|  Furniture|       1|

In [36]:
#SECTION A: PySpark DataFrame Tasks
# 1. Add a TotalAmount column: unit price multiplied by quantity.
from pyspark.sql.functions import col

line_total = col("Price") * col("Quantity")
# withColumn replaces a same-named column, so re-running this cell is safe.
orders_df = orders_df.withColumn("TotalAmount", line_total)
orders_df.select("OrderID", "Product", "Quantity", "Price", "TotalAmount").show()

+-------+---------+--------+-------+-----------+
|OrderID|  Product|Quantity|  Price|TotalAmount|
+-------+---------+--------+-------+-----------+
|      1|   Laptop|       2|50000.0|   100000.0|
|      2|    Mouse|       1| 1200.0|     1200.0|
|      3|   Tablet|       1|20000.0|    20000.0|
|      4|Bookshelf|       1| 3500.0|     3500.0|
|      5|    Mixer|       1| 5000.0|     5000.0|
|      6| Notebook|       5|  500.0|     2500.0|
|      7|    Phone|       1|30000.0|    30000.0|
+-------+---------+--------+-------+-----------+



In [35]:
# 2. Keep only orders whose TotalAmount is strictly greater than 10000.
high_value_orders = orders_df.where(col("TotalAmount") > 10000)
high_value_orders.show()


+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+--------------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|AmountCategory|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+--------------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|          High|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|        Medium|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|          High|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+--------------+



In [26]:
# 3. Standardize the City field by lower-casing it in place.
from pyspark.sql.functions import lower

city_lowercase = lower(col("City"))
customers_df = customers_df.withColumn("City", city_lowercase)
customers_df.select("CustomerID", "Name", "City").show()


+----------+-----+---------+
|CustomerID| Name|     City|
+----------+-----+---------+
|       101|  Ali|   mumbai|
|       102| Neha|    delhi|
|       103| Ravi|bangalore|
|       104|Sneha|hyderabad|
|       105| Amit|  chennai|
+----------+-----+---------+



In [27]:
# 4. Add an OrderYear column extracted from OrderDate.
from pyspark.sql.functions import year

order_year = year(col("OrderDate"))
orders_df = orders_df.withColumn("OrderYear", order_year)
orders_df.select("OrderID", "OrderDate", "OrderYear").show()


+-------+----------+---------+
|OrderID| OrderDate|OrderYear|
+-------+----------+---------+
|      1|2024-01-10|     2024|
|      2|2024-01-15|     2024|
|      3|2024-02-01|     2024|
|      4|2024-02-10|     2024|
|      5|2024-02-15|     2024|
|      6|2024-03-01|     2024|
|      7|2024-03-02|     2024|
+-------+----------+---------+



In [31]:
# 5. Replace null Email values with a placeholder address.
# Only the Email column is touched; nulls in other columns are left as-is.
customers_df = customers_df.fillna("not_provided@example.com", subset=["Email"])
customers_df.select("Name", "Email").show()


+-----+-----------------+
| Name|            Email|
+-----+-----------------+
|  Ali|    ali@gmail.com|
| Neha|   neha@yahoo.com|
| Ravi| ravi@hotmail.com|
|Sneha|sneha@outlook.com|
| Amit|   amit@gmail.com|
+-----+-----------------+



In [32]:
# 6. Categorize orders by TotalAmount:
#    Low    -> below 5000
#    Medium -> 5000 to 20000, both bounds inclusive
#    High   -> everything else (including a null TotalAmount, which matches
#              neither condition and falls through to otherwise)
from pyspark.sql.functions import when

total_amount = col("TotalAmount")
amount_category = (
    when(total_amount < 5000, "Low")
    .when(total_amount.between(5000, 20000), "Medium")
    .otherwise("High")
)
orders_df = orders_df.withColumn("AmountCategory", amount_category)
orders_df.select("OrderID", "TotalAmount", "AmountCategory").show()


+-------+-----------+--------------+
|OrderID|TotalAmount|AmountCategory|
+-------+-----------+--------------+
|      1|   100000.0|          High|
|      2|     1200.0|           Low|
|      3|    20000.0|        Medium|
|      4|     3500.0|           Low|
|      5|     5000.0|        Medium|
|      6|     2500.0|           Low|
|      7|    30000.0|          High|
+-------+-----------+--------------+



In [33]:
#SECTION B: Spark SQL Tasks
# 7. All order columns for the customer whose Name is exactly 'Ali',
#    found by joining the customers and orders views on CustomerID.
ali_orders = spark.sql("""
SELECT o.*
FROM customers c
JOIN orders o ON c.CustomerID = o.CustomerID
WHERE c.Name = 'Ali'
""")
ali_orders.show()


+-------+----------+-------+-----------+--------+-------+----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+-------+-----------+--------+-------+----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|  Mouse|Electronics|       1| 1200.0|2024-01-15|
+-------+----------+-------+-----------+--------+-------+----------+



In [40]:
# 8. Total spending by each customer
# The `orders` temp view was registered before TotalAmount was added to
# orders_df, and a temp view keeps the DataFrame it was created from.
# Re-register it so o.TotalAmount resolves on a fresh Restart & Run All
# (createOrReplaceTempView is idempotent, so repeating it is harmless).
orders_df.createOrReplaceTempView("orders")

# Group by CustomerID as well as Name so that two different customers who
# happen to share a name are not merged into a single total.
spark.sql("""
SELECT c.Name, SUM(o.TotalAmount) AS TotalSpending
FROM customers c
JOIN orders o ON c.CustomerID = o.CustomerID
GROUP BY c.CustomerID, c.Name
""").show()


+-----+-------------+
| Name|TotalSpending|
+-----+-------------+
| Ravi|       3500.0|
|Sneha|       5000.0|
| Amit|       2500.0|
| Neha|      50000.0|
|  Ali|     101200.0|
+-----+-------------+



In [41]:
# 9. Category with highest total revenue
# The `orders` temp view was registered before TotalAmount was added to
# orders_df, and a temp view keeps the DataFrame it was created from.
# Re-register it so TotalAmount resolves on a fresh Restart & Run All
# (createOrReplaceTempView is idempotent, so repeating it is harmless).
orders_df.createOrReplaceTempView("orders")

# Sum revenue per category, sort descending, and keep only the top row.
spark.sql("""
SELECT Category, SUM(TotalAmount) AS Revenue
FROM orders
GROUP BY Category
ORDER BY Revenue DESC
LIMIT 1
""").show()


+-----------+--------+
|   Category| Revenue|
+-----------+--------+
|Electronics|151200.0|
+-----------+--------+



In [42]:
# 10. Create view customer_orders
# The `orders` temp view was registered before TotalAmount was added to
# orders_df, and a temp view keeps the DataFrame it was created from.
# Re-register it so o.TotalAmount resolves on a fresh Restart & Run All
# (createOrReplaceTempView is idempotent, so repeating it is harmless).
orders_df.createOrReplaceTempView("orders")

# customer_orders exposes one row per order with the customer's name,
# the product, and the order's TotalAmount.
spark.sql("""
CREATE OR REPLACE TEMP VIEW customer_orders AS
SELECT c.Name AS CustomerName, o.Product, o.TotalAmount
FROM customers c
JOIN orders o ON c.CustomerID = o.CustomerID
""")


DataFrame[]

In [43]:
# 11. Rows of customer_orders whose Product appears in at least one order
#     dated after 2024-02-01.
# NOTE(review): the cutoff is '2024-02-01' (exclusive), so orders placed later
# in February 2024 are included. If the task means "after the month of
# February 2024", the predicate should be OrderDate >= '2024-03-01' -- confirm.
# NOTE(review): the filter matches on product name, so every customer_orders
# row for a matching product is returned regardless of that row's own date.
recent_product_orders = spark.sql("""
SELECT *
FROM customer_orders
WHERE Product IN (
    SELECT Product FROM orders WHERE OrderDate > '2024-02-01'
)
""")
recent_product_orders.show()


+------------+---------+-----------+
|CustomerName|  Product|TotalAmount|
+------------+---------+-----------+
|        Neha|    Phone|    30000.0|
|        Ravi|Bookshelf|     3500.0|
|       Sneha|    Mixer|     5000.0|
|        Amit| Notebook|     2500.0|
+------------+---------+-----------+



In [44]:
#SECTION C: Advanced Practice
# 12. Register customers_df as a global temp view and query it.
# Global temp views must be addressed through the `global_temp` database.
customers_df.createOrReplaceGlobalTempView("customers")

mumbai_customers = spark.sql("SELECT * FROM global_temp.customers WHERE City = 'mumbai'")
mumbai_customers.show()


+----------+----+-------------+------+----------+
|CustomerID|Name|        Email|  City|SignupDate|
+----------+----+-------------+------+----------+
|       101| Ali|ali@gmail.com|mumbai|2022-05-10|
+----------+----+-------------+------+----------+



In [45]:
# 13. Write the orders DataFrame to Parquet, replacing any previous output.
orders_df.write.parquet("/content/orders_parquet", mode="overwrite")


In [46]:
# 14. Read the Parquet output back and report how many orders it holds.
orders_parquet = spark.read.parquet("/content/orders_parquet")
order_count = orders_parquet.count()
print("Total Orders:", order_count)


Total Orders: 7


In [48]:
#SECTION D: UDF + Built-in Function Tasks
# 15.Mask email using UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def mask_email(email):
    """Mask the local part of an email address, keeping only its first character.

    "ali@gmail.com" becomes "a***@gmail.com".

    Args:
        email: The address to mask, or None (Spark passes SQL NULL to a
            Python UDF as None).

    Returns:
        The masked address; None when ``email`` is None; the input unchanged
        when it does not contain exactly one "@" or has an empty local part.
    """
    # A null Email must stay null instead of raising AttributeError on .split.
    if email is None:
        return None

    parts = email.split("@")
    if len(parts) != 2:
        return email

    local_part, domain = parts
    # Nothing to reveal for an address like "@example.com"; indexing [0] on
    # the empty local part would raise IndexError.
    if not local_part:
        return email

    return local_part[0] + "***@" + domain

# Wrap the Python function as a Spark UDF that returns a string column.
mask_email_udf = udf(mask_email, returnType=StringType())

# Add the masked address next to the original Email column.
masked_email = mask_email_udf(col("Email"))
customers_df = customers_df.withColumn("MaskedEmail", masked_email)
customers_df.select("Email", "MaskedEmail").show()


+-----------------+----------------+
|            Email|     MaskedEmail|
+-----------------+----------------+
|    ali@gmail.com|  a***@gmail.com|
|   neha@yahoo.com|  n***@yahoo.com|
| ravi@hotmail.com|r***@hotmail.com|
|sneha@outlook.com|s***@outlook.com|
|   amit@gmail.com|  a***@gmail.com|
+-----------------+----------------+



In [49]:
# 16. Build a "<Name> from <City>" label with concat_ws.
from pyspark.sql.functions import concat_ws

customer_label = concat_ws(" from ", col("Name"), col("City"))
customers_df = customers_df.withColumn("Label", customer_label)
customers_df.select("Label").show()


+--------------------+
|               Label|
+--------------------+
|     Ali from mumbai|
|     Neha from delhi|
| Ravi from bangalore|
|Sneha from hyderabad|
|   Amit from chennai|
+--------------------+



In [50]:
# 17. Clean Product names: drop every character that is not an ASCII letter,
#     a digit, or a space.
from pyspark.sql.functions import regexp_replace

clean_product = regexp_replace(col("Product"), "[^a-zA-Z0-9 ]", "")
orders_df = orders_df.withColumn("CleanProduct", clean_product)
orders_df.select("Product", "CleanProduct").show()


+---------+------------+
|  Product|CleanProduct|
+---------+------------+
|   Laptop|      Laptop|
|    Mouse|       Mouse|
|   Tablet|      Tablet|
|Bookshelf|   Bookshelf|
|    Mixer|       Mixer|
| Notebook|    Notebook|
|    Phone|       Phone|
+---------+------------+



In [51]:
# 18. Days elapsed between each customer's signup date and today.
from pyspark.sql.functions import to_date, datediff, lit
from datetime import date

# "Today" is taken from the driver's clock as an ISO-8601 string.
today_str = date.today().isoformat()

# First convert SignupDate from string to a date, then diff it against today.
customers_df = (
    customers_df
    .withColumn("SignupDate", to_date("SignupDate"))
    .withColumn("DaysSinceSignup", datediff(lit(today_str), col("SignupDate")))
)
customers_df.select("Name", "SignupDate", "DaysSinceSignup").show()


+-----+----------+---------------+
| Name|SignupDate|DaysSinceSignup|
+-----+----------+---------------+
|  Ali|2022-05-10|           1121|
| Neha|2023-01-15|            871|
| Ravi|2021-11-01|           1311|
|Sneha|2020-07-22|           1778|
| Amit|2023-03-10|            817|
+-----+----------+---------------+

