In [0]:
# Assuming you are running this in a Databricks notebook
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, lit, coalesce, sum, count, current_timestamp
from pyspark.sql.types import IntegerType, DoubleType, StringType, DateType

# Initialize Spark Session (already available in Databricks notebooks)
# spark = SparkSession.builder.appName("EcomDataExtraction").getOrCreate()


# GCS locations used by every step below: the bucket holding the raw CSV
# exports and the bucket receiving the cleaned Parquet output. Both keep their
# trailing "/" because file and dataset names are appended to them directly.
raw_data_path, processed_data_path = (
    "gs://batch-processing-de_raw_data/",
    "gs://batch-processing-de_stagging_data/",
)

# --- Ingest and Basic Cleaning for Orders ---
# Loads orders.csv from the raw bucket, casts every column to its target type
# (renaming "status" to "order_status"), discards rows that have a null in any
# required column, stamps the rows with the processing time and overwrites the
# "orders_cleaned" Parquet dataset in the processed bucket.
# Any failure is reported on stdout only; execution continues with the next table.
print("Processing Orders data...")
try:
    df_orders_raw = (
        spark.read
        .options(header=True, inferSchema=True)
        .csv(f"{raw_data_path}orders.csv")
    )

    df_orders = (
        df_orders_raw
        .select(
            col("order_id").cast(IntegerType()).alias("order_id"),
            col("customer_id").cast(IntegerType()).alias("customer_id"),
            # Date strings are assumed to be formatted as YYYY-MM-DD
            to_date(col("order_date"), "yyyy-MM-dd").alias("order_date"),
            col("total_amount").cast(DoubleType()).alias("total_amount"),
            col("status").cast(StringType()).alias("order_status"),
        )
        # order_status is optional; every other column must be present
        .dropna(subset=["order_id", "customer_id", "order_date", "total_amount"])
        # Record when this batch was processed
        .withColumn("processed_timestamp", current_timestamp())
    )

    # df_orders.show()

    # Parquet keeps later reads of the cleaned data efficient
    df_orders.write.parquet(f"{processed_data_path}orders_cleaned", mode="overwrite")
    print(f"Cleaned Orders data written to {processed_data_path}orders_cleaned")

except Exception as e:
    print(f"Error processing orders: {e}")
    # Consider logging to a dedicated logging service

# --- Ingest and Basic Cleaning for Order Items ---
# Loads order_items.csv from the raw bucket, casts every column to its target
# type, discards rows that have a null in any column (all five are required),
# stamps the rows with the processing time and overwrites the
# "order_items_cleaned" Parquet dataset in the processed bucket.
# Any failure is reported on stdout only; execution continues with the next table.
print("Processing Order Items data...")
try:
    df_order_items_raw = (
        spark.read
        .options(header=True, inferSchema=True)
        .csv(f"{raw_data_path}order_items.csv")
    )

    df_order_items = (
        df_order_items_raw
        .select(
            col("order_item_id").cast(IntegerType()).alias("order_item_id"),
            col("order_id").cast(IntegerType()).alias("order_id"),
            col("product_id").cast(IntegerType()).alias("product_id"),
            col("quantity").cast(IntegerType()).alias("quantity"),
            col("unit_price").cast(DoubleType()).alias("unit_price"),
        )
        .dropna(subset=["order_item_id", "order_id", "product_id", "quantity", "unit_price"])
        # Record when this batch was processed
        .withColumn("processed_timestamp", current_timestamp())
    )

    df_order_items.write.parquet(f"{processed_data_path}order_items_cleaned", mode="overwrite")
    print(f"Cleaned Order Items data written to {processed_data_path}order_items_cleaned")

except Exception as e:
    print(f"Error processing order items: {e}")

# --- Ingest and Basic Cleaning for Products ---
# Loads products.csv from the raw bucket, casts every column to its target type
# (renaming "category" to "product_category" and "price" to "product_price"),
# discards rows that have a null in any required column, stamps the rows with
# the processing time and overwrites the "products_cleaned" Parquet dataset in
# the processed bucket.
# Any failure is reported on stdout only; execution continues with the next table.
print("Processing Products data...")
try:
    df_products_raw = (
        spark.read
        .options(header=True, inferSchema=True)
        .csv(f"{raw_data_path}products.csv")
    )

    df_products = (
        df_products_raw
        .select(
            col("product_id").cast(IntegerType()).alias("product_id"),
            col("product_name").cast(StringType()).alias("product_name"),
            col("category").cast(StringType()).alias("product_category"),
            col("price").cast(DoubleType()).alias("product_price"),
        )
        # product_category is optional; every other column must be present
        .dropna(subset=["product_id", "product_name", "product_price"])
        # Record when this batch was processed
        .withColumn("processed_timestamp", current_timestamp())
    )

    df_products.write.parquet(f"{processed_data_path}products_cleaned", mode="overwrite")
    print(f"Cleaned Products data written to {processed_data_path}products_cleaned")

except Exception as e:
    print(f"Error processing products: {e}")

# --- Ingest and Basic Cleaning for Customers ---
# Loads customers.csv from the raw bucket, casts every column to its target
# type, discards rows that have a null in any required column, stamps the rows
# with the processing time and overwrites the "customers_cleaned" Parquet
# dataset in the processed bucket.
# Any failure is reported on stdout only; execution continues afterwards.
print("Processing Customers data...")
try:
    df_customers_raw = (
        spark.read
        .options(header=True, inferSchema=True)
        .csv(f"{raw_data_path}customers.csv")
    )

    df_customers = (
        df_customers_raw
        .select(
            col("customer_id").cast(IntegerType()).alias("customer_id"),
            col("first_name").cast(StringType()).alias("first_name"),
            col("last_name").cast(StringType()).alias("last_name"),
            col("email").cast(StringType()).alias("email"),
            # Date strings are parsed with the yyyy-MM-dd pattern
            to_date(col("registration_date"), "yyyy-MM-dd").alias("registration_date"),
            col("country").cast(StringType()).alias("country"),
        )
        # Names and country are optional; id, email and registration date are required
        .dropna(subset=["customer_id", "email", "registration_date"])
        # Record when this batch was processed
        .withColumn("processed_timestamp", current_timestamp())
    )

    df_customers.write.parquet(f"{processed_data_path}customers_cleaned", mode="overwrite")
    print(f"Cleaned Customers data written to {processed_data_path}customers_cleaned")

except Exception as e:
    print(f"Error processing customers: {e}")

# spark.stop() # No need to stop in Databricks notebooks
# NOTE(review): this message is printed unconditionally. Each ingest step above
# catches its own exceptions and only prints them, so this line can still
# appear after one or more tables failed to process.
print("All raw data processed and cleaned to Parquet.")

Processing Orders data...
Cleaned Orders data written to gs://batch-processing-de_stagging_data/orders_cleaned
Processing Order Items data...
Cleaned Order Items data written to gs://batch-processing-de_stagging_data/order_items_cleaned
Processing Products data...
Cleaned Products data written to gs://batch-processing-de_stagging_data/products_cleaned
Processing Customers data...
Cleaned Customers data written to gs://batch-processing-de_stagging_data/customers_cleaned
All raw data processed and cleaned to Parquet.
