In [63]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder \
.appName('Multicity Retail Chain') \
.getOrCreate()

In [64]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

#RAW DATA

In [65]:
sales_data = [
    ("TXN001","Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("TXN002","Mumbai","Mobile ","electronics","32000","05/01/2024","Completed"),
    ("TXN003","Bangalore","Tablet"," Electronics ","30000","2024/01/06","Completed"),
    ("TXN004","Delhi","Laptop","Electronics","","2024-01-07","Cancelled"),
    ("TXN005","Chennai","Mobile","Electronics","invalid","2024-01-08","Completed"),
    ("TXN006","Mumbai","Tablet","Electronics",None,"2024-01-08","Completed"),
    ("TXN007","Delhi","Laptop","electronics","45000","09-01-2024","Completed"),
    ("TXN008","Bangalore","Mobile","Electronics","28000","2024-01-09","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed")
]

In [66]:
customer_data = [
("C001","Delhi","Premium"),
("C002","Mumbai","Standard"),
("C003","Bangalore","Premium"),
("C004","Chennai","Standard"),
("C005","Mumbai","Premium")
]

In [67]:
city_lookup = [
("Delhi","Tier-1"),
("Mumbai","Tier-1"),
("Bangalore","Tier-1"),
("Chennai","Tier-2")
]

#PHASE 1 — DATA INGESTION & SCHEMA MANAGEMENT

1. Create schemas explicitly for all datasets

In [68]:
sales_schema = StructType([
    StructField("txn_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("product", StringType(), True),
    StructField("category", StringType(), True),
    StructField("amount", StringType(), True),
    StructField("txn_date", StringType(), True),
    StructField("status", StringType(), True)
])

In [69]:
customer_schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("customer_type", StringType(), True)
])

In [70]:
city_schema = StructType([
    StructField("city", StringType(), True),
    StructField("tier", StringType(), True)
])

2. Load raw data into DataFrames

In [71]:
sales_df = spark.createDataFrame(sales_data, sales_schema)
customer_df = spark.createDataFrame(customer_data, customer_schema)
city_df = spark.createDataFrame(city_lookup, city_schema)

3. Handle incorrect data types gracefully

In [72]:
from pyspark.sql.functions import col, when
sales_type_safe_df = sales_df.withColumn(
    "amount_safe",
    when(col("amount").rlike("^[0-9]+$"), col("amount").cast("int")).otherwise(None)
)

4. Identify corrupt and invalid records

In [73]:
sales_df.filter(
    col("amount").isNull() |
    (col("amount")=="") |
    (col("amount").rlike("^[0-9]+$"))
)

DataFrame[txn_id: string, city: string, product: string, category: string, amount: string, txn_date: string, status: string]

#PHASE 2 — DATA CLEANING & TRANSFORMATION

5. Trim and normalize string columns

In [74]:
sales_clean_df = sales_df \
.withColumn("city", trim(col("city"))) \
.withColumn("product", trim(col("product")))
sales_clean_df.show()

+------+---------+-------+-------------+-------+----------+---------+
|txn_id|     city|product|     category| amount|  txn_date|   status|
+------+---------+-------+-------------+-------+----------+---------+
|TXN001|    Delhi| Laptop|  Electronics|  45000|2024-01-05|Completed|
|TXN002|   Mumbai| Mobile|  electronics|  32000|05/01/2024|Completed|
|TXN003|Bangalore| Tablet| Electronics |  30000|2024/01/06|Completed|
|TXN004|    Delhi| Laptop|  Electronics|       |2024-01-07|Cancelled|
|TXN005|  Chennai| Mobile|  Electronics|invalid|2024-01-08|Completed|
|TXN006|   Mumbai| Tablet|  Electronics|   NULL|2024-01-08|Completed|
|TXN007|    Delhi| Laptop|  electronics|  45000|09-01-2024|Completed|
|TXN008|Bangalore| Mobile|  Electronics|  28000|2024-01-09|Completed|
|TXN009|   Mumbai| Laptop|  Electronics|  55000|2024-01-10|Completed|
|TXN009|   Mumbai| Laptop|  Electronics|  55000|2024-01-10|Completed|
+------+---------+-------+-------------+-------+----------+---------+



6. Convert category to uppercase

In [75]:
sales_clean_df = sales_clean_df.withColumn("category", upper(col("category")))
sales_clean_df.show()

+------+---------+-------+-------------+-------+----------+---------+
|txn_id|     city|product|     category| amount|  txn_date|   status|
+------+---------+-------+-------------+-------+----------+---------+
|TXN001|    Delhi| Laptop|  ELECTRONICS|  45000|2024-01-05|Completed|
|TXN002|   Mumbai| Mobile|  ELECTRONICS|  32000|05/01/2024|Completed|
|TXN003|Bangalore| Tablet| ELECTRONICS |  30000|2024/01/06|Completed|
|TXN004|    Delhi| Laptop|  ELECTRONICS|       |2024-01-07|Cancelled|
|TXN005|  Chennai| Mobile|  ELECTRONICS|invalid|2024-01-08|Completed|
|TXN006|   Mumbai| Tablet|  ELECTRONICS|   NULL|2024-01-08|Completed|
|TXN007|    Delhi| Laptop|  ELECTRONICS|  45000|09-01-2024|Completed|
|TXN008|Bangalore| Mobile|  ELECTRONICS|  28000|2024-01-09|Completed|
|TXN009|   Mumbai| Laptop|  ELECTRONICS|  55000|2024-01-10|Completed|
|TXN009|   Mumbai| Laptop|  ELECTRONICS|  55000|2024-01-10|Completed|
+------+---------+-------+-------------+-------+----------+---------+



7. Convert amount to integer

In [76]:
from pyspark.sql.functions import col, expr

sales_clean_df = sales_clean_df.withColumn(
    "amount",
    expr("try_cast(amount AS INT)")
)

8. Handle invalid and null amounts

In [77]:
sales_clean_df = sales_clean_df.filter(col("amount").isNotNull())

9. Parse multiple date formats into DateType

In [78]:
sales_clean_df=sales_clean_df.withColumn("txn_date_parsed",coalesce(
                             try_to_timestamp(col("txn_date"), lit("yyyy-MM-dd")),
                             try_to_timestamp(col("txn_date"), lit("dd-MM-yyyy")),
                             try_to_timestamp(col("txn_date"), lit("MM-dd-yyyy")),
                             try_to_timestamp(col("txn_date"), lit("dd/MM/yyyy")),
                             try_to_timestamp(col("txn_date"), lit("MM/dd/yyyy")),
                             try_to_timestamp(col("txn_date"), lit("yyyy/MM/dd"))
                                                          ).cast(DateType()))


10. Remove duplicate transactions

In [79]:
sales_clean_df = sales_clean_df.dropDuplicates(["txn_id"])

11. Keep only Completed transactions

In [80]:
sales_clean_df = sales_clean_df.filter(col("status") == "Completed")

#PHASE 3 — DATA ENRICHMENT & JOINS

12. Join sales data with city lookup

In [81]:
enriched_sales_df = sales_clean_df.join(city_df, "city", "left")

13. Use broadcast join where appropriate


In [82]:
from pyspark.sql.functions import broadcast
enriched_sales_df = sales_clean_df.join(broadcast(city_df), "city", "left")

14. Explain join strategy used


In [83]:
enriched_sales_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Filter (status#2852 = Completed)
:  +- Deduplicate [txn_id#2846]
:     +- Project [txn_id#2846, city#2859, product#2860, category#2883, amount#2906, txn_date#2851, status#2852, cast(coalesce(try_to_timestamp(txn_date#2851, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS txn_date_parsed#2907]
:        +- Filter isnotnull(amount#2906)
:           +- Project [txn_id#2846, city#2859, product#2860, category#2883, try_cast(amount#2850 as int) AS amount#2906, txn_date#2851, stat

15. Enrich sales data with city tier

In [84]:
enriched_sales_df = enriched_sales_df.select("*", col("tier").alias("city_tier"))

#PHASE 4 — ANALYTICS & WINDOW FUNCTIONS

16. Revenue per city

In [85]:
revenue_per_city_df = enriched_sales_df.groupBy("city").agg(sum("amount").alias("total_revenue"))

17. Revenue per product


In [86]:
revenue_product_df = enriched_sales_df.groupBy("product").agg(sum("amount").alias("total_revenue"))

18. Rank cities by total revenue


In [87]:
from pyspark.sql.window import Window
city_rank_window = Window.orderBy(desc("total_revenue"))
ranked_cities_df = revenue_per_city_df.withColumn("rank", rank().over(city_rank_window))

19. Rank products within each city


In [88]:
city_product_window = Window.partitionBy("city").orderBy(desc("amount"))
ranked_products_df = enriched_sales_df.withColumn("rank", rank().over(city_product_window))

20. Identify top-performing city per day

In [89]:
daily_city_window = Window.partitionBy("txn_date").orderBy(desc("amount"))
top_performing_city_df = enriched_sales_df.withColumn("rank", rank().over(daily_city_window)).filter(col("rank")==1)

#PHASE 5 — CACHING, PARTITIONS & OPTIMIZATION

21. Identify reusable DataFrames


In [90]:
enriched_sales_df.cache()

DataFrame[city: string, txn_id: string, product: string, category: string, amount: int, txn_date: string, status: string, txn_date_parsed: date, tier: string, city_tier: string]

22. Apply caching appropriately


In [91]:
enriched_sales_df.count()

6

23. Compare performance with and without cache


In [92]:
enriched_sales_df.unpersist()
enriched_sales_df.cache()
enriched_sales_df.count()

6

24. Repartition data by city



In [93]:
partitioned_df = enriched_sales_df.repartition("city")

25. Explain why partitioning helps

In [94]:
partitioned_df.explain(True)

== Parsed Logical Plan ==
'RepartitionByExpression ['city]
+- Project [city#2859, txn_id#2846, product#2860, category#2883, amount#2906, txn_date#2851, status#2852, txn_date_parsed#2907, tier#2857, tier#2857 AS city_tier#2950]
   +- Project [city#2859, txn_id#2846, product#2860, category#2883, amount#2906, txn_date#2851, status#2852, txn_date_parsed#2907, tier#2857]
      +- Join LeftOuter, (city#2859 = city#2856)
         :- Filter (status#2852 = Completed)
         :  +- Deduplicate [txn_id#2846]
         :     +- Project [txn_id#2846, city#2859, product#2860, category#2883, amount#2906, txn_date#2851, status#2852, cast(coalesce(try_to_timestamp(txn_date#2851, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_ti

#PHASE 6 — FILE FORMAT STRATEGY

26. Write cleaned data to Parquet


In [95]:
enriched_sales_df.write.mode("overwrite").parquet("cleaned_data.parquet")

In [96]:
parquet_df = spark.read.parquet("cleaned_data.parquet")
parquet_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- txn_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- txn_date_parsed: date (nullable = true)
 |-- tier: string (nullable = true)
 |-- city_tier: string (nullable = true)



27. Write aggregated data to ORC


In [97]:
enriched_sales_df.write.mode("overwrite").orc("cleaned_data.orc")

In [98]:
df_orc = spark.read.orc("cleaned_data.orc")
df_orc.show()

+---------+------+-------+-------------+------+----------+---------+---------------+------+---------+
|     city|txn_id|product|     category|amount|  txn_date|   status|txn_date_parsed|  tier|city_tier|
+---------+------+-------+-------------+------+----------+---------+---------------+------+---------+
|Bangalore|TXN003| Tablet| ELECTRONICS | 30000|2024/01/06|Completed|     2024-01-06|Tier-1|   Tier-1|
|Bangalore|TXN008| Mobile|  ELECTRONICS| 28000|2024-01-09|Completed|     2024-01-09|Tier-1|   Tier-1|
|    Delhi|TXN007| Laptop|  ELECTRONICS| 45000|09-01-2024|Completed|     2024-01-09|Tier-1|   Tier-1|
|    Delhi|TXN001| Laptop|  ELECTRONICS| 45000|2024-01-05|Completed|     2024-01-05|Tier-1|   Tier-1|
|   Mumbai|TXN002| Mobile|  ELECTRONICS| 32000|05/01/2024|Completed|     2024-01-05|Tier-1|   Tier-1|
|   Mumbai|TXN009| Laptop|  ELECTRONICS| 55000|2024-01-10|Completed|     2024-01-10|Tier-1|   Tier-1|
+---------+------+-------+-------------+------+----------+---------+--------------

28. Compare file structure and size


In [99]:
import os

# Function to get directory size
def get_dir_size(path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return total_size

# Parquet file structure and size
parquet_path = "cleaned_data.parquet"
print(f"\n--- Parquet Files ({parquet_path}) ---")
if os.path.exists(parquet_path):
    print("File Structure:")
    for root, dirs, files in os.walk(parquet_path):
        for name in files:
            print(os.path.join(root, name))
    parquet_size = get_dir_size(parquet_path)
    print(f"Total Parquet Size: {parquet_size / (1024 * 1024):.2f} MB")
else:
    print("Parquet directory not found.")

# ORC file structure and size
orc_path = "cleaned_data.orc"
print(f"\n--- ORC Files ({orc_path}) ---")
if os.path.exists(orc_path):
    print("File Structure:")
    for root, dirs, files in os.walk(orc_path):
        for name in files:
            print(os.path.join(root, name))
    orc_size = get_dir_size(orc_path)
    print(f"Total ORC Size: {orc_size / (1024 * 1024):.2f} MB")
else:
    print("ORC directory not found.")



--- Parquet Files (cleaned_data.parquet) ---
File Structure:
cleaned_data.parquet/.part-00099-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet.crc
cleaned_data.parquet/.part-00100-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet.crc
cleaned_data.parquet/.part-00070-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet.crc
cleaned_data.parquet/part-00070-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet
cleaned_data.parquet/part-00099-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet
cleaned_data.parquet/part-00100-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet
cleaned_data.parquet/_SUCCESS
cleaned_data.parquet/.part-00000-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet.crc
cleaned_data.parquet/._SUCCESS.crc
cleaned_data.parquet/.part-00061-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet.crc
cleaned_data.parquet/part-00191-2a882503-6a27-4185-bef2-2a7e1cc56253-c000.snappy.parquet
cleaned_data.parquet/part-00000-2a882503-6a27-4

29. Explain why Avro is not used here


In [100]:
# Avro was not used here primarily because:
# 1. This task focuses on batch processing, data cleaning, transformation, and analytical queries on static datasets. Parquet and ORC are highly optimized for these types of workloads due to their columnar nature.
# 2. Both Parquet and ORC offer excellent data compression and query performance within the Spark ecosystem, often outperforming row-oriented formats like Avro for analytical queries.
# 3. While Avro excels in scenarios requiring robust schema evolution and streaming data ingestion (e.g., Kafka), this particular use case did not present those specific requirements.
# 4. For the given dataset size and operations, Parquet and ORC provided sufficient efficiency and functionality without the added complexity that Avro might introduce for simple batch ETL.

30. Design a future streaming ingestion using Avro

In [101]:
import json

avro_schema = {
    "type": "record",
    "name": "SalesEvent",
    "namespace": "com.multicityretail.events",
    "fields": [
        {"name": "txn_id", "type": "string"},
        {"name": "city", "type": "string"},
        {"name": "product", "type": "string"},
        {"name": "category", "type": "string"},
        {"name": "amount", "type": ["null", "int"], "default": None},
        {"name": "txn_date", "type": "string"},
        {"name": "status", "type": "string"},
        {
            "name": "txn_date_parsed",
            "type": ["null", {"type": "long", "logicalType": "timestamp-millis"}],
            "default": None
        },
        {"name": "tier", "type": ["null", "string"], "default": None},
        {"name": "city_tier", "type": ["null", "string"], "default": None}
    ]
}

# Save the Avro schema to a .avsc file
schema_file_name = "sales_event.avsc"
with open(schema_file_name, "w") as f:
    json.dump(avro_schema, f, indent=2)

# Print the content of the .avsc file
with open(schema_file_name, "r") as f:
    print(f.read())

{
  "type": "record",
  "name": "SalesEvent",
  "namespace": "com.multicityretail.events",
  "fields": [
    {
      "name": "txn_id",
      "type": "string"
    },
    {
      "name": "city",
      "type": "string"
    },
    {
      "name": "product",
      "type": "string"
    },
    {
      "name": "category",
      "type": "string"
    },
    {
      "name": "amount",
      "type": [
        "null",
        "int"
      ],
      "default": null
    },
    {
      "name": "txn_date",
      "type": "string"
    },
    {
      "name": "status",
      "type": "string"
    },
    {
      "name": "txn_date_parsed",
      "type": [
        "null",
        {
          "type": "long",
          "logicalType": "timestamp-millis"
        }
      ],
      "default": null
    },
    {
      "name": "tier",
      "type": [
        "null",
        "string"
      ],
      "default": null
    },
    {
      "name": "city_tier",
      "type": [
        "null",
        "string"
      ],
      "defaul

In [102]:
pip install avro



In [103]:
import avro.datafile
import avro.io
from datetime import datetime, timezone

# Read the Avro file back
deserialized_records = []
with open(output_avro_file, "rb") as fo:
    reader = avro.datafile.DataFileReader(fo, avro.io.DatumReader())
    for record in reader:
        deserialized_records.append(record)
    reader.close()

# Get the deserialized record
deserialized_record = deserialized_records[0]

# Normalize txn_date_parsed in deserialized_record for consistent comparison
if 'txn_date_parsed' in deserialized_record and deserialized_record['txn_date_parsed'] is not None:
    # Convert to UTC datetime object for consistent comparison
    deserialized_record['txn_date_parsed'] = deserialized_record['txn_date_parsed'].astimezone(timezone.utc)

# Trim category field in deserialized_record for consistent comparison
if 'category' in deserialized_record and deserialized_record['category'] is not None:
    deserialized_record['category'] = deserialized_record['category'].strip()

# Trim category field in original sample_record for consistent comparison
original_sample_record_normalized = sample_record.copy()
if 'category' in original_sample_record_normalized and original_sample_record_normalized['category'] is not None:
    original_sample_record_normalized['category'] = original_sample_record_normalized['category'].strip()

# Print the deserialized record to compare with the original sample_record
print(f"\nDeserialized record from {output_avro_file}:")
print(deserialized_record)

# Compare the deserialized record with the normalized original sample_record
is_match = (deserialized_record == original_sample_record_normalized)
print(f"\nDoes the deserialized record match the original sample record? {is_match}")


Deserialized record from sample_sales_event.avro:
{'txn_id': 'TXN003', 'city': 'Bangalore', 'product': 'Tablet', 'category': 'ELECTRONICS', 'amount': 30000, 'txn_date': '2024/01/06', 'status': 'Completed', 'txn_date_parsed': datetime.datetime(2024, 1, 6, 0, 0, tzinfo=datetime.timezone.utc), 'tier': 'Tier-1', 'city_tier': 'Tier-1'}

Does the deserialized record match the original sample record? True


#PHASE 7 — DEBUGGING & ERROR HANDLING

31. Identify common mistakes (intentional bugs)


In [104]:
enriched_sales_df.explain(True)

== Parsed Logical Plan ==
'Project [*, 'tier AS city_tier#2950]
+- Project [city#2859, txn_id#2846, product#2860, category#2883, amount#2906, txn_date#2851, status#2852, txn_date_parsed#2907, tier#2857]
   +- Join LeftOuter, (city#2859 = city#2856)
      :- Filter (status#2852 = Completed)
      :  +- Deduplicate [txn_id#2846]
      :     +- Project [txn_id#2846, city#2859, product#2860, category#2883, amount#2906, txn_date#2851, status#2852, cast(coalesce(try_to_timestamp(txn_date#2851, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS

32. Debug schema mismatch errors


In [105]:
enriched_sales_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- txn_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- txn_date_parsed: date (nullable = true)
 |-- tier: string (nullable = true)
 |-- city_tier: string (nullable = true)



33. Debug NoneType DataFrame errors


In [106]:
assert enriched_sales_df is not None

34. Use explain() to identify inefficiencies

In [107]:
enriched_sales_df.explain(True)

== Parsed Logical Plan ==
'Project [*, 'tier AS city_tier#2950]
+- Project [city#2859, txn_id#2846, product#2860, category#2883, amount#2906, txn_date#2851, status#2852, txn_date_parsed#2907, tier#2857]
   +- Join LeftOuter, (city#2859 = city#2856)
      :- Filter (status#2852 = Completed)
      :  +- Deduplicate [txn_id#2846]
      :     +- Project [txn_id#2846, city#2859, product#2860, category#2883, amount#2906, txn_date#2851, status#2852, cast(coalesce(try_to_timestamp(txn_date#2851, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#2851, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS

#PHASE 8 — FINAL VALIDATION & DELIVERABLES

35. Validate record counts


In [108]:
enriched_sales_df.count()

6

36. Ensure no nulls in critical fields


In [109]:
enriched_sales_df.filter(col("amount").isNull()).count()

0

37. Confirm schema correctness

In [110]:
enriched_sales_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- txn_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- txn_date_parsed: date (nullable = true)
 |-- tier: string (nullable = true)
 |-- city_tier: string (nullable = true)



38. Document optimization decisions

In [112]:
enriched_sales_df.describe().show()

+-------+---------+------+-------+-------------+------------------+----------+---------+------+---------+
|summary|     city|txn_id|product|     category|            amount|  txn_date|   status|  tier|city_tier|
+-------+---------+------+-------+-------------+------------------+----------+---------+------+---------+
|  count|        6|     6|      6|            6|                 6|         6|        6|     6|        6|
|   mean|     NULL|  NULL|   NULL|         NULL|39166.666666666664|      NULL|     NULL|  NULL|     NULL|
| stddev|     NULL|  NULL|   NULL|         NULL|10759.491933482113|      NULL|     NULL|  NULL|     NULL|
|    min|Bangalore|TXN001| Laptop| ELECTRONICS |             28000|05/01/2024|Completed|Tier-1|   Tier-1|
|    max|   Mumbai|TXN009| Tablet|  ELECTRONICS|             55000|2024/01/06|Completed|Tier-1|   Tier-1|
+-------+---------+------+-------+-------------+------------------+----------+---------+------+---------+

