In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MyApp") \
    .getOrCreate()

# Dataset

In [28]:
raw_sales = [
("TXN001","Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
("TXN002","Mumbai","Mobile ","electronics","32000","05/01/2024","Completed"),
("TXN003","Bangalore","Tablet"," Electronics ","30000","2024/01/06","Completed"),
("TXN004","Delhi","Laptop","Electronics","","2024-01-07","Cancelled"),
("TXN005","Chennai","Mobile","Electronics","invalid","2024-01-08","Completed"),
("TXN006","Mumbai","Tablet","Electronics",None,"2024-01-08","Completed"),
("TXN007","Delhi","Laptop","electronics","45000","09-01-2024","Completed"),
("TXN008","Bangalore","Mobile","Electronics","28000","2024-01-09","Completed"),
("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed")
]

# Transformations


1.   Convert transaction_date into DateType
2.   Convert amount to integer, Handle invalid, empty, and null amount values



In [29]:
from datetime import datetime

def parse_date(date_str):
    if not date_str or date_str.strip().lower() == "invalid_date":
        return None
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%Y/%m/%d", "%d-%m-%Y"):
        try:
            return datetime.strptime(date_str.strip(), fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None

In [30]:
def parse_price(price_str):
    try:
        return int(price_str)
    except (TypeError, ValueError):
        return None

In [31]:
preprocessed = []
for row in raw_sales:
    order_id, city, device, typeofDevice, amount, orederDate, status = row
    preprocessed.append((
        order_id,
        city,
        device,
        typeofDevice,
        parse_price(amount),
        parse_date(orederDate),
        status
    ))

# Convert to Spark DF

In [32]:
columns=["order_id", "city", "device", "typeDevice", "price", "orederDate", "status"]

In [34]:
df = spark.createDataFrame(preprocessed, columns)

In [35]:
df.show()

+--------+---------+-------+-------------+-----+----------+---------+
|order_id|     city| device|   typeDevice|price|orederDate|   status|
+--------+---------+-------+-------------+-----+----------+---------+
|  TXN001|   Delhi | Laptop|  Electronics|45000|2024-01-05|Completed|
|  TXN002|   Mumbai|Mobile |  electronics|32000|2024-01-05|Completed|
|  TXN003|Bangalore| Tablet| Electronics |30000|2024-01-06|Completed|
|  TXN004|    Delhi| Laptop|  Electronics| NULL|2024-01-07|Cancelled|
|  TXN005|  Chennai| Mobile|  Electronics| NULL|2024-01-08|Completed|
|  TXN006|   Mumbai| Tablet|  Electronics| NULL|2024-01-08|Completed|
|  TXN007|    Delhi| Laptop|  electronics|45000|2024-01-09|Completed|
|  TXN008|Bangalore| Mobile|  Electronics|28000|2024-01-09|Completed|
|  TXN009|   Mumbai| Laptop|  Electronics|55000|2024-01-10|Completed|
|  TXN009|   Mumbai| Laptop|  Electronics|55000|2024-01-10|Completed|
+--------+---------+-------+-------------+-----+----------+---------+



In [36]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- device: string (nullable = true)
 |-- typeDevice: string (nullable = true)
 |-- price: long (nullable = true)
 |-- orederDate: string (nullable = true)
 |-- status: string (nullable = true)



# Trim space and normailize data

In [37]:
from pyspark.sql.functions import col, trim, lower, upper, regexp_replace, when
from pyspark.sql.functions import to_date
from pyspark.sql.types import IntegerType

#COLUMN OPERATIONS

In [38]:

from pyspark.sql.functions import col, round, when

df = df.withColumn("price_with_tax", round(col("price") * 1.18, 2))

In [39]:
df = df.withColumn("total_amt", (col("price")+col("price_with_tax")))

In [40]:
df.show()

+--------+---------+-------+-------------+-----+----------+---------+--------------+---------+
|order_id|     city| device|   typeDevice|price|orederDate|   status|price_with_tax|total_amt|
+--------+---------+-------+-------------+-----+----------+---------+--------------+---------+
|  TXN001|   Delhi | Laptop|  Electronics|45000|2024-01-05|Completed|       53100.0|  98100.0|
|  TXN002|   Mumbai|Mobile |  electronics|32000|2024-01-05|Completed|       37760.0|  69760.0|
|  TXN003|Bangalore| Tablet| Electronics |30000|2024-01-06|Completed|       35400.0|  65400.0|
|  TXN004|    Delhi| Laptop|  Electronics| NULL|2024-01-07|Cancelled|          NULL|     NULL|
|  TXN005|  Chennai| Mobile|  Electronics| NULL|2024-01-08|Completed|          NULL|     NULL|
|  TXN006|   Mumbai| Tablet|  Electronics| NULL|2024-01-08|Completed|          NULL|     NULL|
|  TXN007|    Delhi| Laptop|  electronics|45000|2024-01-09|Completed|       53100.0|  98100.0|
|  TXN008|Bangalore| Mobile|  Electronics|28000|20

# ANALYTICS TRANSFORMATIONS

# Total revenue per city

In [43]:
from pyspark.sql import functions as F

revenue_per_city = (
    df.filter(df.price.isNotNull())
      .groupBy("city")
      .agg(F.sum("price").alias("total_revenue"))
)

In [44]:
df = df.withColumn("price", df["price"].cast("double"))

In [46]:
df.show()

+--------+---------+-------+-------------+-------+----------+---------+--------------+---------+
|order_id|     city| device|   typeDevice|  price|orederDate|   status|price_with_tax|total_amt|
+--------+---------+-------+-------------+-------+----------+---------+--------------+---------+
|  TXN001|   Delhi | Laptop|  Electronics|45000.0|2024-01-05|Completed|       53100.0|  98100.0|
|  TXN002|   Mumbai|Mobile |  electronics|32000.0|2024-01-05|Completed|       37760.0|  69760.0|
|  TXN003|Bangalore| Tablet| Electronics |30000.0|2024-01-06|Completed|       35400.0|  65400.0|
|  TXN004|    Delhi| Laptop|  Electronics|   NULL|2024-01-07|Cancelled|          NULL|     NULL|
|  TXN005|  Chennai| Mobile|  Electronics|   NULL|2024-01-08|Completed|          NULL|     NULL|
|  TXN006|   Mumbai| Tablet|  Electronics|   NULL|2024-01-08|Completed|          NULL|     NULL|
|  TXN007|    Delhi| Laptop|  electronics|45000.0|2024-01-09|Completed|       53100.0|  98100.0|
|  TXN008|Bangalore| Mobile|  

In [47]:
revenue_per_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|   Mumbai|       142000|
|   Delhi |        45000|
|    Delhi|        45000|
+---------+-------------+



# Average per city

In [48]:
avg_order_value_per_city = (
    df.filter(df.price.isNotNull())
      .groupBy("city")
      .agg(F.avg("price").alias("avg_order_value"))
)
avg_order_value_per_city.show()


+---------+------------------+
|     city|   avg_order_value|
+---------+------------------+
|Bangalore|           29000.0|
|   Mumbai|47333.333333333336|
|   Delhi |           45000.0|
|    Delhi|           45000.0|
+---------+------------------+



# Top 3 cities by revenue

In [49]:
top_cities_by_revenue = (
    df.filter(df.price.isNotNull())
      .groupBy("city")
      .agg(F.sum("price").alias("total_revenue"))
      .orderBy(F.desc("total_revenue"))
      .limit(3)
)
top_cities_by_revenue.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|   Mumbai|     142000.0|
|Bangalore|      58000.0|
|   Delhi |      45000.0|
+---------+-------------+



# Identify Products with Average Amount > 40.00

In [51]:
products_avg_gt_40 = (
    df.filter(df.price.isNotNull())
      .groupBy("device")
      .agg(F.avg("price").alias("avg_amount"))
      .filter(F.col("avg_amount") > 40.00)
)
products_avg_gt_40.show()

+-------+----------+
| device|avg_amount|
+-------+----------+
| Laptop|   50000.0|
|Mobile |   32000.0|
| Tablet|   30000.0|
| Mobile|   28000.0|
+-------+----------+



# Check number of partitions

In [52]:

print("Current partitions:", df.rdd.getNumPartitions())

Current partitions: 2


# Repartition by city


In [53]:
df_repartitioned = df.repartition("city")

print("Partitions after repartition:", df_repartitioned.rdd.getNumPartitions())

Partitions after repartition: 1


# Observe the plan

In [54]:

revenue_per_city.explain(True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, 'sum('price) AS total_revenue#166]
+- Filter isnotnull(price#111L)
   +- Project [order_id#107, city#108, device#109, typeDevice#110, price#111L, orederDate#112, status#113, price_with_tax#136, (cast(price#111L as double) + price_with_tax#136) AS total_amt#137]
      +- Project [order_id#107, city#108, device#109, typeDevice#110, price#111L, orederDate#112, status#113, round((cast(price#111L as double) * 1.18), 2) AS price_with_tax#136]
         +- LogicalRDD [order_id#107, city#108, device#109, typeDevice#110, price#111L, orederDate#112, status#113], false

== Analyzed Logical Plan ==
city: string, total_revenue: bigint
Aggregate [city#108], [city#108, sum(price#111L) AS total_revenue#166L]
+- Filter isnotnull(price#111L)
   +- Project [order_id#107, city#108, device#109, typeDevice#110, price#111L, orederDate#112, status#113, price_with_tax#136, (cast(price#111L as double) + price_with_tax#136) AS total_amt#137]
      +- Project [

# Data to Parquet

In [55]:

df.write.mode("overwrite").parquet("data/parquet/sales")

# Write data to ORC

In [56]:
df.write.mode("overwrite").orc("data/orc/sales")