In [55]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import time

In [56]:
spark = SparkSession.builder \
    .appName("Order Processing Pipeline") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()

#PHASE 1

#1

In [57]:

df_raw = spark.read.csv("orders.csv", header=True, inferSchema=False)


#2

In [58]:

df_raw.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



#3

In [59]:

total_records = df_raw.count()

#4

In [60]:

df_raw.show(5)


+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
only showing top 5 rows


#5


- Reading with "inferSchema=False" treats all columns as StringType
- This is essential because:
  a) Mixed data formats
  b) Invalid values in numeric columns
  C) Gives full control over type conversion and error handling
- We can safely clean and validate before converting to proper types


#PHASE 2

#1

In [61]:
df_cleaned= df_raw.withColumn("city", trim(col("city"))) \
                    .withColumn("category", trim(col("category"))) \
                    .withColumn("product", trim(col("product")))
df_cleaned.select("city", "category", "product").show(5, truncate=False)

+---------+-----------+-----------+
|city     |category   |product    |
+---------+-----------+-----------+
|hyderabad|grocery    |Oil        |
|Pune     |Grocery    |Sugar      |
|Pune     |Electronics|Mobile     |
|Bangalore|Electronics|Laptop     |
|Pune     |Home       |AirPurifier|
+---------+-----------+-----------+
only showing top 5 rows


#2

In [62]:
df_cleaned = df_cleaned.withColumn("city", initcap(col("city"))) \
                       .withColumn("category", initcap(col("category"))) \
                       .withColumn("product", initcap(col("product")))
df_cleaned.select("city", "category", "product").show(5, truncate=False)


+---------+-----------+-----------+
|city     |category   |product    |
+---------+-----------+-----------+
|Hyderabad|Grocery    |Oil        |
|Pune     |Grocery    |Sugar      |
|Pune     |Electronics|Mobile     |
|Bangalore|Electronics|Laptop     |
|Pune     |Home       |Airpurifier|
+---------+-----------+-----------+
only showing top 5 rows


#3

In [63]:

df_cleaned = df_cleaned.withColumn("amount_stripped",
    regexp_replace(col("amount"), ",", "")
)

df_cleaned = df_cleaned.withColumn("amount_clean",
    expr("try_cast(amount_stripped as int)")
).drop("amount_stripped")

df_cleaned.select("amount", "amount_clean").show(10, truncate=False)

+-------+------------+
|amount |amount_clean|
+-------+------------+
|invalid|NULL        |
|35430  |35430       |
|65358  |65358       |
|5558   |5558        |
|33659  |33659       |
|8521   |8521        |
|42383  |42383       |
|45362  |45362       |
|10563  |10563       |
|63715  |63715       |
+-------+------------+
only showing top 10 rows


#4

In [64]:
from pyspark.sql.functions import udf
from datetime import datetime

def safe_parse_date(date_str):
    if date_str is None or date_str.strip() == "":
        return None

    formats = ['%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d']
    for fmt in formats:
        try:
            return datetime.strptime(date_str.strip(), fmt).date()
        except:
            continue
    return None

parse_date_udf = udf(safe_parse_date, DateType())

df_cleaned = df_cleaned.withColumn("order_date_clean",
    parse_date_udf(col("order_date"))
)

df_cleaned.select("order_date", "order_date_clean").show(10, truncate=False)


+----------+----------------+
|order_date|order_date_clean|
+----------+----------------+
|01/01/2024|2024-01-01      |
|2024-01-02|2024-01-02      |
|2024-01-03|2024-01-03      |
|2024-01-04|2024-01-04      |
|2024-01-05|2024-01-05      |
|2024-01-06|2024-01-06      |
|2024-01-07|2024-01-07      |
|2024-01-08|2024-01-08      |
|2024-01-09|2024-01-09      |
|2024-01-10|2024-01-10      |
+----------+----------------+
only showing top 10 rows


#5

In [65]:

print("Columns in dataset:", df_cleaned.columns)

Columns in dataset: ['order_id', 'customer_id', 'city', 'category', 'product', 'amount', 'order_date', 'status', 'amount_clean', 'order_date_clean']


#PHASE 3

#1

In [66]:

invalid_amounts = df_cleaned.filter(
    (col("amount").isNotNull()) & (col("amount_clean").isNull())
).count()
print(f"Records with invalid amounts: {invalid_amounts}")

Records with invalid amounts: 15790


#2

In [67]:

invalid_dates = df_cleaned.filter(
    (col("order_date").isNotNull()) & (col("order_date_clean").isNull())
).count()
print(f"Records with invalid dates: {invalid_dates}")

Records with invalid dates: 2595


#3

In [68]:
duplicate_orders = df_cleaned.groupBy("order_id").count().filter(col("count") > 1)
duplicate_count = duplicate_orders.count()
print(f"Number of duplicate order_ids: {duplicate_count}")
duplicate_orders.show(5)


Number of duplicate order_ids: 0
+--------+-----+
|order_id|count|
+--------+-----+
+--------+-----+



#4

In [69]:
row_count_before = df_cleaned.count()
df_deduped = df_cleaned.dropDuplicates(["order_id"])
row_count_after = df_deduped.count()
print(f"Rows before deduplication: {row_count_before}")
print(f"Rows after deduplication: {row_count_after}")
print(f"Rows removed: {row_count_before - row_count_after}")

Rows before deduplication: 300000
Rows after deduplication: 300000
Rows removed: 0


#5

In [70]:
row_count_before_filter = df_deduped.count()
df_completed = df_deduped.filter(col("status") == "Completed")
row_count_after_filter = df_completed.count()
print(f"Rows before filtering: {row_count_before_filter}")
print(f"Rows after filtering (Completed only): {row_count_after_filter}")


Rows before filtering: 300000
Rows after filtering (Completed only): 285000


#6

In [71]:

print("\n6. Row counts at every stage:")
print(f"Stage 1 - Raw data: {total_records}")
print(f"Stage 2 - After cleaning: {df_cleaned.count()}")
print(f"Stage 3 - After deduplication: {df_deduped.count()}")
print(f"Stage 4 - After status filter: {df_completed.count()}")



6. Row counts at every stage:
Stage 1 - Raw data: 300000
Stage 2 - After cleaning: 300000
Stage 3 - After deduplication: 300000
Stage 4 - After status filter: 285000


#PHASE 4

#1

In [72]:
num_partitions_before = df_completed.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions_before}")

Number of partitions: 2


#2

In [73]:
revenue_by_city = df_completed.groupBy("city").agg(
    sum("amount_clean").alias("total_revenue")
)
revenue_by_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|   1628527093|
|  Chennai|   1629865247|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
|     Pune|   1646196535|
|    Delhi|   1639639916|
|Hyderabad|   1642443340|
+---------+-------------+



#3

In [74]:
revenue_by_city.explain(True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, 'sum('amount_clean) AS total_revenue#1120]
+- Filter (status#642 = Completed)
   +- Deduplicate [order_id#635]
      +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_clean#722, safe_parse_date(order_date#641)#731 AS order_date_clean#732]
         +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_clean#722]
            +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_stripped#721, try_cast(amount_stripped#721 as int) AS amount_clean#722]
               +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, regexp_replace(amount#640, ,, , 1) AS amount_stripped#721]
                  +- Project [order_id#635, customer_id#636, city#705, categor

#4


- This happens during the groupBy operation
- Data is redistributed across executors based on city hash
- Indicated by 'Exchange' operator in the physical plan

#5

In [75]:

df_repartitioned = df_completed.repartition(col("city"))
num_partitions_after = df_repartitioned.rdd.getNumPartitions()
print(f"Partitions after repartition: {num_partitions_after}")

Partitions after repartition: 3


#6

In [76]:

print("\nBEFORE REPARTITION:")
df_completed.groupBy("city").agg(sum("amount_clean")).explain(True)

print("\nAFTER REPARTITION:")
df_repartitioned.groupBy("city").agg(sum("amount_clean")).explain(True)


BEFORE REPARTITION:
== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('sum('amount_clean))]
+- Filter (status#642 = Completed)
   +- Deduplicate [order_id#635]
      +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_clean#722, safe_parse_date(order_date#641)#731 AS order_date_clean#732]
         +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_clean#722]
            +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_stripped#721, try_cast(amount_stripped#721 as int) AS amount_clean#722]
               +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, regexp_replace(amount#640, ,, , 1) AS amount_stripped#721]
                  +- Project [order_id#635, customer_id#636, c

#PHASE 5

#1

In [77]:

revenue_per_city = df_completed.groupBy("city").agg(
    sum("amount_clean").alias("total_revenue")
)
revenue_per_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|   1628527093|
|  Chennai|   1629865247|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
|     Pune|   1646196535|
|    Delhi|   1639639916|
|Hyderabad|   1642443340|
+---------+-------------+



#2

In [78]:
revenue_per_category = df_completed.groupBy("category").agg(
    sum("amount_clean").alias("total_revenue")
)
revenue_per_category.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|   2868467576|
|    Fashion|   2834182172|
|    Grocery|   2866272106|
|Electronics|   2867568870|
+-----------+-------------+



#3

In [79]:
avg_order_value = df_completed.groupBy("city").agg(
    avg("amount_clean").alias("avg_order_value")
)
avg_order_value.show()

+---------+------------------+
|     city|   avg_order_value|
+---------+------------------+
|Bangalore|44098.867908689645|
|  Chennai| 43628.27900315863|
|   Mumbai| 43723.75651612556|
|  Kolkata|43709.816662630175|
|     Pune|43930.204013556424|
|    Delhi| 43817.20780331374|
|Hyderabad| 43708.74045293664|
+---------+------------------+



#4

In [80]:

top_products = df_completed.groupBy("product").agg(
    sum("amount_clean").alias("total_revenue")
).orderBy(col("total_revenue").desc()).limit(10)
top_products.show()

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|        Oil|    963572869|
|     Laptop|    962496295|
|     Tablet|    960719999|
|     Vacuum|    959149427|
|      Mixer|    957140026|
|       Rice|    954494237|
|Airpurifier|    952178123|
|      Jeans|    951286127|
|      Sugar|    948205000|
|      Shoes|    946799102|
+-----------+-------------+



#5

In [81]:

cities_by_revenue = revenue_per_city.orderBy(col("total_revenue").desc())
cities_by_revenue.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|     Pune|   1646196535|
|Hyderabad|   1642443340|
|    Delhi|   1639639916|
|  Chennai|   1629865247|
|Bangalore|   1628527093|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
+---------+-------------+



#PHASE 6

#1

In [82]:

window_spec_city = Window.orderBy(col("total_revenue").desc())
cities_ranked = revenue_per_city.withColumn(
    "rank", dense_rank().over(window_spec_city)
)
cities_ranked.show()

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|     Pune|   1646196535|   1|
|Hyderabad|   1642443340|   2|
|    Delhi|   1639639916|   3|
|  Chennai|   1629865247|   4|
|Bangalore|   1628527093|   5|
|   Mumbai|   1625518096|   6|
|  Kolkata|   1624300497|   7|
+---------+-------------+----+



#2

In [83]:

product_revenue = df_completed.groupBy("category", "product").agg(
    sum("amount_clean").alias("revenue")
)
window_spec_category = Window.partitionBy("category").orderBy(col("revenue").desc())
products_ranked = product_revenue.withColumn(
    "rank_in_category", dense_rank().over(window_spec_category)
)
products_ranked.orderBy("category", "rank_in_category").show(20, truncate=False)


+-----------+-----------+---------+----------------+
|category   |product    |revenue  |rank_in_category|
+-----------+-----------+---------+----------------+
|Electronics|Laptop     |962496295|1               |
|Electronics|Tablet     |960719999|2               |
|Electronics|Mobile     |944352576|3               |
|Fashion    |Jeans      |951286127|1               |
|Fashion    |Shoes      |946799102|2               |
|Fashion    |Tshirt     |936096943|3               |
|Grocery    |Oil        |963572869|1               |
|Grocery    |Rice       |954494237|2               |
|Grocery    |Sugar      |948205000|3               |
|Home       |Vacuum     |959149427|1               |
|Home       |Mixer      |957140026|2               |
|Home       |Airpurifier|952178123|3               |
+-----------+-----------+---------+----------------+



#3

In [84]:

top_product_per_category = products_ranked.filter(col("rank_in_category") == 1)
top_product_per_category.show(truncate=False)

+-----------+-------+---------+----------------+
|category   |product|revenue  |rank_in_category|
+-----------+-------+---------+----------------+
|Electronics|Laptop |962496295|1               |
|Fashion    |Jeans  |951286127|1               |
|Grocery    |Oil    |963572869|1               |
|Home       |Vacuum |959149427|1               |
+-----------+-------+---------+----------------+



#4

In [85]:

top_3_cities = cities_ranked.filter(col("rank") <= 3)
top_3_cities.show()

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|     Pune|   1646196535|   1|
|Hyderabad|   1642443340|   2|
|    Delhi|   1639639916|   3|
+---------+-------------+----+



#PHASE 7

#1

In [87]:

city_region_data = [
    ("Delhi", "North"),
    ("Mumbai", "West"),
    ("Bangalore", "South"),
    ("Hyderabad", "South"),
    ("Pune", "West"),
    ("Chennai", "South"),
    ("Kolkata", "East")
]
city_region_df = spark.createDataFrame(city_region_data, ["city", "region"])
city_region_df.show()

+---------+------+
|     city|region|
+---------+------+
|    Delhi| North|
|   Mumbai|  West|
|Bangalore| South|
|Hyderabad| South|
|     Pune|  West|
|  Chennai| South|
|  Kolkata|  East|
+---------+------+



#2

In [88]:

df_with_region = df_completed.join(
    broadcast(city_region_df),
    on="city",
    how="left"
)
df_with_region.select("order_id", "city", "region", "amount_clean").show(10)


+-----------+---------+------+------------+
|   order_id|     city|region|amount_clean|
+-----------+---------+------+------------+
|ORD00000001|     Pune|  West|       35430|
|ORD00000007|     Pune|  West|       45362|
|ORD00000008|Bangalore| South|       10563|
|ORD00000010|Bangalore| South|       66576|
|ORD00000011|  Kolkata|  East|       50318|
|ORD00000012|Bangalore| South|       84768|
|ORD00000014|   Mumbai|  West|       79469|
|ORD00000015|     Pune|  West|       81018|
|ORD00000017|Bangalore| South|       69582|
|ORD00000019|   Mumbai|  West|        NULL|
+-----------+---------+------+------------+
only showing top 10 rows


#3

In [89]:

df_with_region.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Filter (status#642 = Completed)
:  +- Deduplicate [order_id#635]
:     +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_clean#722, safe_parse_date(order_date#641)#731 AS order_date_clean#732]
:        +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_clean#722]
:           +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, amount_stripped#721, try_cast(amount_stripped#721 as int) AS amount_clean#722]
:              +- Project [order_id#635, customer_id#636, city#705, category#706, product#707, amount#640, order_date#641, status#642, regexp_replace(amount#640, ,, , 1) AS amount_stripped#721]
:                 +- Project [order_id#635, customer_id#636, city#705, category#706, initcap(product#691) AS produ

#4


1. Small Dataset Size: city_region table has only 7 rows
2. Avoids Shuffle: Small table is broadcast to all executors
3. No Data Movement: Large orders table stays partitioned
4. Memory Efficient: 7 rows easily fit in executor memory
5. Performance: Eliminates expensive shuffle operation
6. Use Case: Perfect for dimension tables in star schema

#PHASE 8

#1

In [90]:

def classify_order_value(amount):
    if amount is None:
        return "Unknown"
    elif amount >= 80000:
        return "High"
    elif amount >= 40000:
        return "Medium"
    else:
        return "Low"

classify_udf = udf(classify_order_value, StringType())

df_classified = df_completed.withColumn(
    "order_value_category",
    classify_udf(col("amount_clean"))
)

df_classified.select("order_id", "amount_clean", "order_value_category").show(15)


+-----------+------------+--------------------+
|   order_id|amount_clean|order_value_category|
+-----------+------------+--------------------+
|ORD00000001|       35430|                 Low|
|ORD00000007|       45362|              Medium|
|ORD00000008|       10563|                 Low|
|ORD00000010|       66576|              Medium|
|ORD00000011|       50318|              Medium|
|ORD00000012|       84768|                High|
|ORD00000014|       79469|              Medium|
|ORD00000015|       81018|                High|
|ORD00000017|       69582|              Medium|
|ORD00000019|        NULL|             Unknown|
|ORD00000022|       48832|              Medium|
|ORD00000023|       12000|                 Low|
|ORD00000024|       18082|                 Low|
|ORD00000025|       58248|              Medium|
|ORD00000028|       70675|              Medium|
+-----------+------------+--------------------+
only showing top 15 rows


#2

In [91]:

distribution = df_classified.groupBy("order_value_category").agg(
    count("*").alias("count"),
    sum("amount_clean").alias("total_revenue")
).orderBy("order_value_category")
distribution.show()


+--------------------+------+-------------+
|order_value_category| count|total_revenue|
+--------------------+------+-------------+
|                High| 27936|   2375126256|
|                 Low|121794|   2372490528|
|              Medium|111365|   6688873940|
|             Unknown| 23905|         NULL|
+--------------------+------+-------------+



#PHASE 9

#1

In [92]:

rdd = df_completed.rdd
print(f"RDD created with {rdd.count()} records")
print("Sample RDD element:")
print(rdd.take(1))

RDD created with 285000 records
Sample RDD element:
[Row(order_id='ORD00000001', customer_id='C000001', city='Pune', category='Grocery', product='Sugar', amount='35430', order_date='2024-01-02', status='Completed', amount_clean=35430, order_date_clean=datetime.date(2024, 1, 2))]


#2 a

In [93]:

total_revenue_rdd = rdd.map(lambda row: row.amount_clean if row.amount_clean else 0) \
                       .reduce(lambda a, b: a + b)
print(f"Total Revenue (RDD): {total_revenue_rdd}")

Total Revenue (RDD): 11436490724


#2 b

In [94]:

orders_per_city = rdd.map(lambda row: (row.city, 1)) \
                     .reduceByKey(lambda a, b: a + b) \
                     .collect()
print("Orders per city:")
for city, count in sorted(orders_per_city, key=lambda x: x[1], reverse=True):
    print(f"{city}: {count}")

Orders per city:
Hyderabad: 41041
Pune: 40883
Delhi: 40854
Chennai: 40736
Mumbai: 40612
Kolkata: 40563
Bangalore: 40311


#3


1. OPTIMIZATION:
   - Catalyst Optimizer: Query optimization
   - Tungsten Engine: Memory management & code generation
   - RDDs: No optimization, manual tuning required

2. PERFORMANCE:
   - DataFrames: Columnar storage, predicate pushdown
   - RDDs: Row-based operations, no built-in optimizations

3. API & EASE OF USE:
   - DataFrames: SQL-like operations, declarative
   - RDDs: Procedural, requires more code

4. MEMORY:
   - DataFrames: Efficient columnar format
   - RDDs: Java object overhead

#PHASE 10

#1

Dataset 'df_completed' is used in multiple aggregations

#2

In [96]:

df_cached = df_completed.cache()
df_cached.count()

285000

#3

In [97]:

start_time = time.time()
result1 = df_cached.groupBy("city").count().collect()
result2 = df_cached.groupBy("category").count().collect()
result3 = df_cached.groupBy("product").count().collect()
cached_time = time.time() - start_time
print(f"Time with caching: {cached_time:.2f} seconds")

df_uncached = df_completed
start_time = time.time()
result1 = df_uncached.groupBy("city").count().collect()
result2 = df_uncached.groupBy("category").count().collect()
result3 = df_uncached.groupBy("product").count().collect()
uncached_time = time.time() - start_time
print(f"Time without caching: {uncached_time:.2f} seconds")
print(f"Performance improvement: {((uncached_time - cached_time) / uncached_time * 100):.1f}%")


Time with caching: 11.49 seconds
Time without caching: 10.59 seconds
Performance improvement: -8.6%


#4

In [98]:

print(f"Cached execution: {cached_time:.2f}s")
print(f"Uncached execution: {uncached_time:.2f}s")
print(f"Speedup factor: {(uncached_time / cached_time):.2f}x")

Cached execution: 11.49s
Uncached execution: 10.59s
Speedup factor: 0.92x


#5

In [99]:

df_cached.unpersist()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, amount_clean: int, order_date_clean: date]



1. MEMORY EXHAUSTION:
   - Cached data consumes executor memory
   - Can lead to OutOfMemory errors
   - Reduces memory for task execution


2. WRONG USE CASES:
   - Don't cache if data used only once
   - Don't cache very large datasets
   - Don't cache if transformations are cheap

3. STORAGE LEVELS:
   - MEMORY_ONLY: Fast but risky
   - MEMORY_AND_DISK: Safer but slower
   - Choose based on use case


#PHASE 11

#1

In [101]:

df_completed.write.mode("overwrite").partitionBy("city").parquet("output/orders_parquet")



#2

In [102]:

revenue_per_city.write.mode("overwrite").orc("output/revenue_by_city_orc")
revenue_per_category.write.mode("overwrite").orc("output/revenue_by_category_orc")


✓ Aggregated data written to ORC format


#3

In [104]:

df_parquet = spark.read.parquet("output/orders_parquet")
print("Parquet Schema:")
df_parquet.printSchema()
print(f"Parquet Row Count: {df_parquet.count()}")

print("\nReading and validating ORC format:")
df_orc = spark.read.orc("output/revenue_by_city_orc")
print("ORC Schema:")
df_orc.printSchema()
print(f"ORC Row Count: {df_orc.count()}")


Parquet Schema:
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_clean: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)
 |-- city: string (nullable = true)

Parquet Row Count: 285000

Reading and validating ORC format:
ORC Schema:
root
 |-- city: string (nullable = true)
 |-- total_revenue: long (nullable = true)

ORC Row Count: 7


#4

In [108]:

print("\n4. Comparing storage formats:")
import os

def get_dir_size(path):
    total = 0
    try:
        for entry in os.scandir(path):
            if entry.is_file():
                total += entry.stat().st_size
            elif entry.is_dir():
                total += get_dir_size(entry.path)
    except:
        pass
    return total

csv_size = get_dir_size("orders.csv") if os.path.exists("orders.csv") else 0
parquet_size = get_dir_size("output/orders_parquet")
orc_size = get_dir_size("output/revenue_by_city_orc")

print(f"\nStorage Size Comparison:")
print(f"CSV: {csv_size / 1024:.2f} KB")
print(f"Parquet: {parquet_size / 1024:.2f} KB")
print(f"ORC: {orc_size / 1024:.2f} KB")
if csv_size > 0:
    print(f"Compression ratio (CSV to Parquet): {(csv_size / parquet_size):.2f}x")



4. Comparing storage formats:

Storage Size Comparison:
CSV: 0.00 KB
Parquet: 6201.48 KB
ORC: 0.55 KB



PARQUET:
- Columnar storage
- Excellent compression
- Schema evolution support


ORC:
- Optimized Row Columnar
- Better compression than Parquet
- Native Hive format


CSV:
- Row-based
- No compression
- Human-readable


#PHASE 12


The problem is that .show() returns None, not a DataFrame.

1. df.filter(df.amount > 50000) → Returns a DataFrame
2. .show() → Displays data and returns None
3. df = None → Variable df is now None, not a DataFrame

EXECUTION FLOW:
filtered_df = df.filter(df.amount > 50000)  # This is a DataFrame
result = filtered_df.show()                   # result is None
df = result                                   # df is now None!

CORRECT APPROACH:
Separate operations
df_filtered = df.filter(df.amount > 50000)
df_filtered.show()




#PHASE 13

#1

In [107]:

print("\n1. Confirming data types and null validation:")

print("\na) Checking amount_clean is IntegerType:")
amount_type = df_completed.schema["amount_clean"].dataType
print(f"amount_clean type: {amount_type}")
assert isinstance(amount_type, IntegerType), "amount_clean is not IntegerType!"
print(" Confirmed: amount_clean is IntegerType")

print("\nb) Checking order_date_clean is DateType:")
date_type = df_completed.schema["order_date_clean"].dataType
print(f"order_date_clean type: {date_type}")
assert isinstance(date_type, DateType), "order_date_clean is not DateType!"
print(" Confirmed: order_date_clean is DateType")

print("\nc) Checking for nulls in critical business fields:")
critical_fields = ["order_id", "customer_id", "city", "category", "product", "status"]
for field in critical_fields:
    null_count = df_completed.filter(col(field).isNull()).count()
    print(f"{field}: {null_count} nulls")

print("\nNull counts for cleaned fields:")
print(f"amount_clean nulls: {df_completed.filter(col('amount_clean').isNull()).count()}")
print(f"order_date_clean nulls: {df_completed.filter(col('order_date_clean').isNull()).count()}")



1. Confirming data types and null validation:

a) Checking amount_clean is IntegerType:
amount_clean type: IntegerType()
✓ Confirmed: amount_clean is IntegerType

b) Checking order_date_clean is DateType:
order_date_clean type: DateType()
✓ Confirmed: order_date_clean is DateType

c) Checking for nulls in critical business fields:
order_id: 0 nulls
customer_id: 0 nulls
city: 0 nulls
category: 0 nulls
product: 0 nulls
status: 0 nulls

Null counts for cleaned fields:
amount_clean nulls: 23905
order_date_clean nulls: 2465
