In [0]:
from pyspark.sql import SparkSession

# Bind `spark` explicitly instead of relying on a session that the hosting
# platform may or may not have pre-created: getOrCreate() hands back the
# already-active session when there is one and starts a new one otherwise,
# so this cell also works on a fresh kernel.
spark = SparkSession.builder.getOrCreate()
spark

In [0]:
#1. Load retail_data.csv into a PySpark DataFrame and display its schema

# The first line of the file supplies the column names and the column types
# are inferred from the data.
df_retail = spark.read.csv(
    "file:/Workspace/Shared/retail_data.csv",
    header=True,
    inferSchema=True,
)

df_retail.printSchema()
df_retail.show()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TransactionDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1003|

In [0]:
#2. Read without schema inference — column types are declared manually via an explicit schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# (column name, Spark type) pairs in file order; every field is nullable.
# TransactionDate is kept as a plain string in this schema.
manual_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("TransactionID", StringType),
        ("Customer", StringType),
        ("City", StringType),
        ("Product", StringType),
        ("Category", StringType),
        ("Quantity", IntegerType),
        ("UnitPrice", IntegerType),
        ("TotalPrice", IntegerType),
        ("TransactionDate", StringType),
        ("PaymentMode", StringType),
    )
])

# With an explicit schema the reader uses these types as given instead of inferring them.
df_manual = spark.read.csv(
    "file:/Workspace/Shared/retail_data.csv",
    header=True,
    schema=manual_schema,
)

df_manual.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TransactionDate: string (nullable = true)
 |-- PaymentMode: string (nullable = true)



In [0]:
#3. Filter transactions where TotalPrice > 40000
# Keep only the rows whose TotalPrice is strictly greater than 40000.
high_value_rows = df_retail.where(df_retail["TotalPrice"] > 40000)
high_value_rows.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+



In [0]:
#4. Get unique cities from the dataset

# Project down to the City column first, then collapse repeated values.
city_column = df_retail.select(df_retail["City"])
city_column.distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [0]:
#5. Find all transactions from "Delhi" using .filter() and .where()
# The same row predicate is built once and applied through both methods.
delhi_only = df_retail["City"] == "Delhi"

# Using filter()
df_retail.filter(delhi_only).show()

# Using where()
df_retail.where(delhi_only).show()

+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1004|    Zoya|Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1006|   Farah|Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|       Cash|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+

+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1004|   

In [0]:
#6. Add a column DiscountedPrice = TotalPrice less a 10% discount
from pyspark.sql.functions import col

# Multiplying by 0.9 keeps 90% of the total, i.e. takes 10% off.
df_retail = df_retail.withColumn(
    "DiscountedPrice",
    col("TotalPrice") * 0.9,
)
df_retail.select(["TransactionID", "TotalPrice", "DiscountedPrice"]).show()


+-------------+----------+---------------+
|TransactionID|TotalPrice|DiscountedPrice|
+-------------+----------+---------------+
|        T1001|     70000|        63000.0|
|        T1002|     60000|        54000.0|
|        T1003|     15000|        13500.0|
|        T1004|     20000|        18000.0|
|        T1005|     50000|        45000.0|
|        T1006|      3000|         2700.0|
+-------------+----------+---------------+



In [0]:
#7. Rename TransactionDate to TxnDate

# Only the column label changes; printSchema() confirms the new name.
df_retail = df_retail.withColumnRenamed(existing="TransactionDate", new="TxnDate")
df_retail.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TxnDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- DiscountedPrice: double (nullable = true)



In [0]:
#8. Drop the column UnitPrice
# Rebind df_retail to a frame without UnitPrice and confirm via the schema.
columns_to_remove = ["UnitPrice"]
df_retail = df_retail.drop(*columns_to_remove)
df_retail.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TxnDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- DiscountedPrice: double (nullable = true)



In [0]:
#9. Get total sales by city
# The dict-style aggregation names its result "sum(TotalPrice)"; rename it for readability.
city_totals = df_retail.groupBy("City").agg({"TotalPrice": "sum"})
city_totals.withColumnRenamed("sum(TotalPrice)", "TotalSales").show()

+---------+----------+
|     City|TotalSales|
+---------+----------+
|Bangalore|     60000|
|   Mumbai|    120000|
|    Delhi|     23000|
|Hyderabad|     15000|
+---------+----------+



In [0]:
#10. Get average unit price by category (need original with UnitPrice)

# UnitPrice was dropped from df_retail earlier, so re-read the source file for this query.
df_full = spark.read.csv(
    "file:/Workspace/Shared/retail_data.csv",
    header=True,
    inferSchema=True,
)

category_averages = df_full.groupBy("Category").agg({"UnitPrice": "avg"})
category_averages.withColumnRenamed("avg(UnitPrice)", "AvgUnitPrice").show()


+-----------+------------+
|   Category|AvgUnitPrice|
+-----------+------------+
|Electronics|     37750.0|
|  Furniture|     10000.0|
+-----------+------------+



In [0]:
#11. Count of transactions grouped by PaymentMode

# One output row per payment mode with the number of transactions using it.
payment_mode_counts = df_retail.groupBy("PaymentMode").count()
payment_mode_counts.show()


+-----------+-----+
|PaymentMode|count|
+-----------+-----+
|Net Banking|    1|
|       Card|    3|
|       Cash|    1|
|        UPI|    1|
+-----------+-----+



In [0]:
#12. Rank transactions by TotalPrice within each City
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Each city is its own partition, ordered from the largest TotalPrice downwards,
# so rank 1 is the highest-value transaction in that city.
highest_first = col("TotalPrice").desc()
win_spec = Window.partitionBy("City").orderBy(highest_first)

ranked = df_retail.withColumn("Rank", rank().over(win_spec))
ranked.select("TransactionID", "City", "TotalPrice", "Rank").show()

+-------------+---------+----------+----+
|TransactionID|     City|TotalPrice|Rank|
+-------------+---------+----------+----+
|        T1002|Bangalore|     60000|   1|
|        T1004|    Delhi|     20000|   1|
|        T1006|    Delhi|      3000|   2|
|        T1003|Hyderabad|     15000|   1|
|        T1001|   Mumbai|     70000|   1|
|        T1005|   Mumbai|     50000|   2|
+-------------+---------+----------+----+



In [0]:
#13. Use lag() to get previous transaction amount per city

from pyspark.sql.functions import lag

# Within each city, rows are ordered by TxnDate so lag() looks one row back;
# the first row of every city has no predecessor and gets NULL.
win_spec_lag = Window.partitionBy("City").orderBy("TxnDate")

previous_amount = lag("TotalPrice", 1).over(win_spec_lag)
with_previous = df_retail.withColumn("PrevAmount", previous_amount)
with_previous.select("TransactionID", "City", "TotalPrice", "PrevAmount").show()

+-------------+---------+----------+----------+
|TransactionID|     City|TotalPrice|PrevAmount|
+-------------+---------+----------+----------+
|        T1002|Bangalore|     60000|      NULL|
|        T1004|    Delhi|     20000|      NULL|
|        T1006|    Delhi|      3000|     20000|
|        T1003|Hyderabad|     15000|      NULL|
|        T1001|   Mumbai|     70000|      NULL|
|        T1005|   Mumbai|     50000|     70000|
+-------------+---------+----------+----------+



In [0]:
#14. Create a second DataFrame: city_region
from pyspark.sql import Row

# Lookup of city -> region; each pair becomes one Row of the new DataFrame.
city_to_region = [
    ("Mumbai", "West"),
    ("Delhi", "North"),
    ("Bangalore", "South"),
    ("Hyderabad", "South"),
]
city_region = spark.createDataFrame(
    [Row(City=city_name, Region=region_name) for city_name, region_name in city_to_region]
)
city_region.show()

+---------+------+
|     City|Region|
+---------+------+
|   Mumbai|  West|
|    Delhi| North|
|Bangalore| South|
|Hyderabad| South|
+---------+------+



In [0]:
#15. Join and get total sales by Region

# Left join on City keeps every transaction row from df_retail.
df_joined = df_retail.join(city_region, "City", "left")

# The dict-style aggregation names its result "sum(TotalPrice)"; rename it for readability.
region_totals = df_joined.groupBy("Region").agg({"TotalPrice": "sum"})
region_totals.withColumnRenamed("sum(TotalPrice)", "TotalSales").show()

+------+----------+
|Region|TotalSales|
+------+----------+
| South|     75000|
|  West|    120000|
| North|     23000|
+------+----------+



In [0]:
#16. Introduce nulls and replace them
# Introduce nulls (simulate for demo)
from pyspark.sql.functions import when

# Blank out Quantity for transaction T1002 only; every other row keeps its value.
is_target_txn = df_retail["TransactionID"] == "T1002"
nulled_quantity = when(is_target_txn, None).otherwise(df_retail["Quantity"])
df_nulls = df_retail.withColumn("Quantity", nulled_quantity)

# Replace with default values
df_cleaned = df_nulls.na.fill({"Quantity": 1})
df_cleaned.show()

+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|     70000|2024-01-15|       Card|        63000.0|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       1|     60000|2024-01-20|        UPI|        54000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|2024-02-10|Net Banking|        13500.0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     20000|2024-02-12|       Card|        18000.0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|     50000|2024-02-15|       Card|        45000.0|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|      3000|2024-02-18|   

In [0]:
#17. Drop rows where Quantity is null
# Starts again from df_nulls (the frame that still contains the null) and rebinds df_cleaned.
df_cleaned = df_nulls.dropna(subset=["Quantity"])
df_cleaned.show()

+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|     70000|2024-01-15|       Card|        63000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|2024-02-10|Net Banking|        13500.0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     20000|2024-02-12|       Card|        18000.0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|     50000|2024-02-15|       Card|        45000.0|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|      3000|2024-02-18|       Cash|         2700.0|
+-------------+--------+---------+-------+-----------+--------+----------+----------+---

In [0]:
#18. Fill null PaymentMode with "Unknown"

# Only the PaymentMode column is targeted by the fill.
df_filled = df_cleaned.na.fill("Unknown", subset=["PaymentMode"])
df_filled.select("TransactionID", "PaymentMode").show()

+-------------+-----------+
|TransactionID|PaymentMode|
+-------------+-----------+
|        T1001|       Card|
|        T1003|Net Banking|
|        T1004|       Card|
|        T1005|       Card|
|        T1006|       Cash|
+-------------+-----------+



In [0]:
#19. Label orders using UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def label_order(amount):
    """Classify an order amount as "High", "Medium" or "Low".

    Args:
        amount: Order total to classify, or None when the value is missing.

    Returns:
        "High" when amount is above 50000, "Medium" when it is between
        30000 and 50000 inclusive, "Low" for anything smaller, and None
        when amount is None.
    """
    # A missing amount cannot be compared with a number (Python raises
    # TypeError), so pass the null through instead of failing.
    if amount is None:
        return None
    if amount > 50000:
        return "High"
    elif amount >= 30000:
        return "Medium"
    else:
        return "Low"

# Wrap the Python function as a Spark UDF whose result is a string column.
label_udf = udf(label_order, returnType=StringType())

# Label every row from its TotalPrice and show the label next to the amount.
order_label = label_udf(df_filled["TotalPrice"])
df_labeled = df_filled.withColumn("OrderLabel", order_label)
df_labeled.select("TransactionID", "TotalPrice", "OrderLabel").show()

+-------------+----------+----------+
|TransactionID|TotalPrice|OrderLabel|
+-------------+----------+----------+
|        T1001|     70000|      High|
|        T1003|     15000|       Low|
|        T1004|     20000|       Low|
|        T1005|     50000|    Medium|
|        T1006|      3000|       Low|
+-------------+----------+----------+



In [0]:
#20. Extract year, month, and day from TxnDate

from pyspark.sql.functions import year, month, dayofmonth, to_date

# Pass TxnDate through to_date() first, then derive the three calendar parts
# from it as separate columns.
df_dates = (
    df_labeled
    .withColumn("TxnDate", to_date("TxnDate"))
    .withColumn("Year", year("TxnDate"))
    .withColumn("Month", month("TxnDate"))
    .withColumn("Day", dayofmonth("TxnDate"))
)
df_dates.select("TransactionID", "TxnDate", "Year", "Month", "Day").show()

+-------------+----------+----+-----+---+
|TransactionID|   TxnDate|Year|Month|Day|
+-------------+----------+----+-----+---+
|        T1001|2024-01-15|2024|    1| 15|
|        T1003|2024-02-10|2024|    2| 10|
|        T1004|2024-02-12|2024|    2| 12|
|        T1005|2024-02-15|2024|    2| 15|
|        T1006|2024-02-18|2024|    2| 18|
+-------------+----------+----+-----+---+



In [0]:
#Q21. Filter transactions in February
# month() returns the month number of TxnDate, so 2 selects February.
in_february = month(df_dates["TxnDate"]) == 2
df_dates.where(in_february).show()

+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountedPrice|OrderLabel|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+----------+----+-----+---+
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|2024-02-10|Net Banking|        13500.0|       Low|2024|    2| 10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     20000|2024-02-12|       Card|        18000.0|       Low|2024|    2| 12|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|     50000|2024-02-15|       Card|        45000.0|    Medium|2024|    2| 15|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|      3000|2024-02-18|       Cash|         2700.0|       Low|2024|    2| 18|
+-------------+-----

In [0]:
#22. Duplicate using union() and remove duplicates
# Union df_dates with itself to produce the duplicated rows this exercise needs.
df_union = df_dates.union(df_dates)
# dropDuplicates() is called without a column subset, so rows are compared on all columns.
df_distinct = df_union.dropDuplicates()
df_distinct.show()


+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountedPrice|OrderLabel|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+----------+----+-----+---+
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|      3000|2024-02-18|       Cash|         2700.0|       Low|2024|    2| 18|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     20000|2024-02-12|       Card|        18000.0|       Low|2024|    2| 12|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|     70000|2024-01-15|       Card|        63000.0|      High|2024|    1| 15|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|2024-02-10|Net Banking|        13500.0|       Low|2024|    2| 10|
|        T1005|   Ka