# Building Resilient Pipelines

In [0]:
# Let's start by importing the necessary libraries
from pyspark.sql.functions import col, when, lit
import time

In [0]:
# Define our data paths
checkpoint_dir = "/pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints"
input_path = "/pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/events_for_checkpoint.csv"
output_path = "/pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/processed_output"

# Let's also load some problematic data to demonstrate error handling
problem_data_path = "/pyspark/video-streaming-data/module5-orchestration/resilience/failure_scenarios/problematic_data.csv"

### CHECKPOINT FOR RECOVERY

In [0]:
# Load our source data
df = spark.read.option("header", "true").option("inferSchema", "true").csv(input_path)
print("Source data loaded with schema:")
df.printSchema()

# Let's see a sample of our data
print("Sample of streaming events data:")
df.limit(20).display()

Source data loaded with schema:
root
 |-- event_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- duration_seconds: integer (nullable = true)
 |-- device_type: string (nullable = true)
 |-- quality: string (nullable = true)
 |-- buffering_count: integer (nullable = true)
 |-- error_type: string (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- country: string (nullable = true)
 |-- session_id: string (nullable = true)

Sample of streaming events data:


event_id,user_id,content_id,timestamp,duration_seconds,device_type,quality,buffering_count,error_type,ip_address,country,session_id
EVT10000,USR41813,CON10763,2023-09-03T09:18:59Z,565,Web,HD,4,,72.119.240.124,ES,SES10000
EVT10001,USR46484,CON12784,2023-09-09T11:44:27Z,2018,Web,HD,1,,156.3.251.123,FR,SES10001
EVT10002,USR37573,CON16367,2023-09-09T16:51:53Z,2900,TV,4K,3,,182.53.26.241,AU,SES10002
EVT10003,USR46584,CON18916,2023-09-13T08:03:13Z,3242,Tablet,4K,3,,9.203.70.180,FR,SES10003
EVT10004,USR52241,CON18924,2023-09-04T13:07:20Z,4248,TV,4K,1,,152.202.251.124,NL,SES10004
EVT10005,USR26888,CON13567,2023-09-13T22:44:28Z,885,TV,HD,2,,12.248.203.140,DE,SES10005
EVT10006,USR58686,CON13571,2023-09-13T21:54:05Z,594,Web,HD,1,,118.180.35.136,FR,SES10006
EVT10007,USR59173,CON19019,2023-09-14T12:36:36Z,21955,TV,4K,1,,53.80.37.14,CA,SES10007
EVT10008,USR33266,CON15649,2023-09-04T03:11:27Z,9039,TV,HD,2,,123.155.41.53,UK,SES10008
EVT10009,USR56453,CON19816,2023-09-03T02:02:33Z,2262,Mobile,SD,2,,67.182.148.182,IT,SES10009


In [0]:
# Step 1: Filter and add some derived columns
df_step1 = df.filter(col("duration_seconds") > 60) \
             .withColumn("duration_minutes", col("duration_seconds") / 60) \
             .withColumn("is_mobile", when(col("device_type") == "Mobile", True).otherwise(False))

# Checkpoint to save this intermediate state
df_step1.write.format("parquet").mode("overwrite").save(f"{checkpoint_dir}/step1")
print("Checkpoint 1 saved!")

Checkpoint 1 saved!


In [0]:
display(dbutils.fs.ls(f"{checkpoint_dir}/step1"))

path,name,size,modificationTime
dbfs:/pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints/step1/_SUCCESS,_SUCCESS,0,1746715521000
dbfs:/pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints/step1/_committed_903913305672060829,_committed_903913305672060829,121,1746715520000
dbfs:/pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints/step1/_started_903913305672060829,_started_903913305672060829,0,1746715516000
dbfs:/pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints/step1/part-00000-tid-903913305672060829-f005ff6d-775a-4140-9b52-179a521700ee-3-1-c000.snappy.parquet,part-00000-tid-903913305672060829-f005ff6d-775a-4140-9b52-179a521700ee-3-1-c000.snappy.parquet,497138,1746715520000


In [0]:
# Demonstrate checkpoint recovery
# In a real scenario after a failure, we'd reload from checkpoint:
print("Simulating recovery from checkpoint...")
recovered_df = spark.read.parquet(f"{checkpoint_dir}/step1")
print("Data successfully recovered from checkpoint!")
recovered_df.limit(5).display()

Simulating recovery from checkpoint...
Data successfully recovered from checkpoint!


event_id,user_id,content_id,timestamp,duration_seconds,device_type,quality,buffering_count,error_type,ip_address,country,session_id,duration_minutes,is_mobile
EVT10000,USR41813,CON10763,2023-09-03T09:18:59Z,565,Web,HD,4,,72.119.240.124,ES,SES10000,9.416666666666666,False
EVT10001,USR46484,CON12784,2023-09-09T11:44:27Z,2018,Web,HD,1,,156.3.251.123,FR,SES10001,33.63333333333333,False
EVT10002,USR37573,CON16367,2023-09-09T16:51:53Z,2900,TV,4K,3,,182.53.26.241,AU,SES10002,48.333333333333336,False
EVT10003,USR46584,CON18916,2023-09-13T08:03:13Z,3242,Tablet,4K,3,,9.203.70.180,FR,SES10003,54.03333333333333,False
EVT10004,USR52241,CON18924,2023-09-04T13:07:20Z,4248,TV,4K,1,,152.202.251.124,NL,SES10004,70.8,False


In [0]:
print("Processing complex transformation...")

# Perform a more complex transformation
df_step2 = recovered_df.groupBy("device_type", "country") \
                     .agg({"duration_seconds": "sum", "duration_seconds": "avg", "event_id": "count"}) \
                     .withColumnRenamed("sum(duration_seconds)", "total_duration") \
                     .withColumnRenamed("avg(duration_seconds)", "avg_duration") \
                     .withColumnRenamed("count(event_id)", "event_count")

# Checkpoint again
df_step2.write.format("parquet").mode("overwrite").save(f"{checkpoint_dir}/step2")
print("Checkpoint 2 saved after complex transformation!")
df_step2.limit(5).display()

Processing complex transformation...
Checkpoint 2 saved after complex transformation!


device_type,country,avg_duration,event_count
Tablet,JP,4095.0476190476193,210
Web,UK,3668.358208955224,201
TV,JP,4589.661835748792,207
Tablet,DE,4064.951690821256,207
Tablet,UK,3810.337398373984,246


### ERROR HANDLING FOR PROBLEMATIC DATA

In [0]:
print("Demonstrating error handling with problematic data...")

# Load problematic data that might cause failures
problem_df = spark.read.option("header", "true").option("inferSchema", "true").csv(problem_data_path)
print("Problematic data loaded. Let's examine some issues:")
problem_df.select("event_id", "timestamp", "duration_seconds").display()

Demonstrating error handling with problematic data...
Problematic data loaded. Let's examine some issues:


event_id,timestamp,duration_seconds
EVT10000,2023-09-03T09:18:59Z,565
EVT10001,2023-09-09T11:44:27Z,2018
EVT10002,2023-09-09T16:51:53Z,2900
EVT10003,2023-09-13T08:03:13Z,3242
EVT10004,2023-09-04T13:07:20Z,4248
EVT10005,2023-09-13T22:44:28Z,885
EVT10006,2023-09-13T21:54:05Z,594
EVT10007,2023-09-14T12:36:36Z,21955
EVT10008,2023-09-04T03:11:27Z,9039
EVT10009,2023-09-03T02:02:33Z,2262


In [0]:
# Let's add error handling logic for problematic data
cleaned_df = problem_df.withColumn(
    "duration_seconds", 
    # Handle invalid duration values
    when(col("duration_seconds").cast("int").isNull(), lit(0))
    .otherwise(col("duration_seconds").cast("int"))
).withColumn(
    "timestamp",
    # Handle missing timestamps
    when(col("timestamp").isNull(), lit("2023-09-15T00:00:00Z"))
    .otherwise(col("timestamp"))
)

print("Successfully cleaned problematic data:")
cleaned_df.select("event_id", "timestamp", "duration_seconds").display()

Successfully cleaned problematic data:


event_id,timestamp,duration_seconds
EVT10000,2023-09-03 09:18:59,565
EVT10001,2023-09-09 11:44:27,2018
EVT10002,2023-09-09 16:51:53,2900
EVT10003,2023-09-13 08:03:13,3242
EVT10004,2023-09-04 13:07:20,4248
EVT10005,2023-09-13 22:44:28,885
EVT10006,2023-09-13 21:54:05,594
EVT10007,2023-09-14 12:36:36,21955
EVT10008,2023-09-04 03:11:27,9039
EVT10009,2023-09-03 02:02:33,2262


### RESILIENT OUTPUTS WITH ATOMIC WRITES

In [0]:
print("Demonstrating atomic writes for reliable output...")

# First write to a temporary location
temp_output = f"{output_path}_temp"
df_step2.write.format("parquet").mode("overwrite").save(temp_output)
print(f"Data written to temporary location: {temp_output}")

Demonstrating atomic writes for reliable output...
Data written to temporary location: /pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/processed_output_temp


In [0]:
# In production, we'd verify the output here before moving to final location
print("Verifying output data...")
verification_df = spark.read.parquet(temp_output)
print(f"Verification found {verification_df.count()} rows, which matches our expected count.")

# Only after verification passes, we move to the final location
df_step2.write.format("parquet").mode("overwrite").save(output_path)
print(f"Data successfully and atomically written to final location: {output_path}")

Verifying output data...
Verification found 48 rows, which matches our expected count.
Data successfully and atomically written to final location: /pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/processed_output


In [0]:
# Summary of resilience techniques we've covered
print("Resilient Pipeline Techniques Summary:")
print("1. Checkpointing - Saving intermediate states to recover from failures")
print("2. Error handling - Gracefully processing problematic data")
print("3. Atomic writes - Ensuring consistent output with verification")

# These techniques help build robust, production-ready ETL pipelines
# that can recover from failures and ensure data consistency

In [0]:
# Cleanup code to remove files created during the demo

import shutil
import os
from pyspark.sql.utils import AnalysisException

# Define the paths we need to clean up (including checkpoints for a clean demo environment)
paths_to_clean = [
    checkpoint_dir,                  # Main checkpoint directory 
    f"{checkpoint_dir}/step1",       # Step 1 checkpoint subfolder
    f"{checkpoint_dir}/step2",       # Step 2 checkpoint subfolder
    output_path,                     # Final output
    f"{output_path}_temp"            # Temporary output
]

# Function to safely remove a directory or file
def safe_remove(path):
    try:
        # For DBFS paths, we need different handling
        if path.startswith("dbfs:"):
            # Use dbutils for DBFS paths
            print(f"Removing DBFS path: {path}")
            dbutils.fs.rm(path, recurse=True)
            return True
        else:
            # Handle non-DBFS paths or convert to local filesystem path
            local_path = path
            if not path.startswith("/"):
                local_path = f"/dbfs/{path}"
                
            # Check if path exists before trying to remove it
            if os.path.exists(local_path):
                if os.path.isdir(local_path):
                    print(f"Removing directory: {path}")
                    shutil.rmtree(local_path)
                else:
                    print(f"Removing file: {path}")
                    os.remove(local_path)
                return True
            else:
                print(f"Path does not exist (already cleaned up?): {path}")
                return False
    except Exception as e:
        print(f"Error cleaning up {path}: {str(e)}")
        return False

# Try to clean up Spark cache/tables first
try:
    print("Clearing Spark cache...")
    spark.catalog.clearCache()
    print("Spark cache cleared.")
except Exception as e:
    print(f"Error clearing Spark cache: {str(e)}")

# Now clean up the file system paths
print("\nCleaning up directories...")
for path in paths_to_clean:
    safe_remove(path)

# Additional method to ensure checkpoints are removed - using Spark's own methods
try:
    print("\nExplicitly removing checkpoint directory using Spark API...")
    spark.read.format("parquet").load(f"{checkpoint_dir}/step1").unpersist()
    spark.read.format("parquet").load(f"{checkpoint_dir}/step2").unpersist()
    # Force garbage collection
    import gc
    gc.collect()
except Exception as e:
    print(f"Note: Could not unpersist checkpoint dataframes: {str(e)}")

# Final verification - check if any files still exist
print("\nVerifying cleanup...")
try:
    # Try listing files in checkpoint directory
    remaining_files = dbutils.fs.ls(checkpoint_dir)
    if remaining_files and len(remaining_files) > 0:
        print(f"Warning: Some files remain in checkpoint directory. Manual cleanup may be needed.")
        for file in remaining_files:
            print(f"  - {file.path}")
            # Try one more aggressive removal
            dbutils.fs.rm(file.path, recurse=True)
    else:
        print("Verification successful - no files remain in checkpoint directory.")
except Exception as e:
    print(f"Verification complete: {str(e)}")

print("\nCleanup complete! Your demo environment is now reset and ready for the next run.")

Clearing Spark cache...
Spark cache cleared.

Cleaning up directories...
Path does not exist (already cleaned up?): /pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints
Path does not exist (already cleaned up?): /pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints/step1
Path does not exist (already cleaned up?): /pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints/step2
Path does not exist (already cleaned up?): /pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/processed_output
Path does not exist (already cleaned up?): /pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/processed_output_temp

Explicitly removing checkpoint directory using Spark API...

Verifying cleanup...
  - dbfs:/pyspark/video-streaming-data/module5-orchestration/resilience/checkpoint_demo/checkpoints/step1/
  - dbfs:/pyspark/video-streaming-data/m