# Scheduling ETL Jobs

In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from datetime import datetime, timedelta
import os

In [0]:
# Root of the course data set and the folder holding the daily event drops
base_path = "/pyspark/video-streaming-data"
events_path = f"{base_path}/module5-orchestration/scheduling/daily_events"

# Enumerate the folder and report each CSV so we know which days are available
event_files = dbutils.fs.ls(events_path)
for file_info in event_files:
    # Skip anything that is not a daily CSV drop
    if not file_info.name.endswith('.csv'):
        continue
    print(f"Found event file: {file_info.name}")


In [0]:
# Read a single day's CSV (header row present, column types inferred) to explore it
sample_file_path = f"{events_path}/events_2023-09-01.csv"
csv_reader = spark.read.options(header="true", inferSchema="true")
daily_events_df = csv_reader.csv(sample_file_path)

# Show only the first few rows as a preview
print("Sample of daily streaming events:")
preview_df = daily_events_df.limit(5)
preview_df.display()

In [0]:
# Now, let's define a processing function that we would use in our Airflow DAG
def process_daily_events(execution_date):
    """
    Process one day of video streaming events and persist aggregated metrics.

    This function would be called by Airflow for each execution date. It reads
    ``events_<YYYY-MM-DD>.csv`` from ``events_path``, drops rows with a
    duplicate ``event_id``, aggregates the remaining rows per
    ``device_type``/``country`` and overwrites the Parquet output under
    ``processed_metrics/date=<YYYY-MM-DD>``.

    Args:
        execution_date: The day to process. Either an object exposing
            ``strftime`` (e.g. ``datetime``/``date``) or a ``"YYYY-MM-DD"``
            string such as the value of Airflow's ``{{ ds }}`` template.

    Returns:
        dict: On success ``{"date", "record_count", "status": "success"}``,
        where ``record_count`` is the number of raw rows read (before
        de-duplication). On any processing error
        ``{"date", "status": "failed", "error"}``.

    Raises:
        ValueError: If ``execution_date`` is a string that is not a valid
            ``YYYY-MM-DD`` date.

    NOTE: processing failures are reported through the returned dict rather
    than raised, so the caller must inspect ``status`` to detect them.
    """
    # Normalise the input to the YYYY-MM-DD form used in our filenames
    if isinstance(execution_date, str):
        # Round-trip through strptime so a malformed string fails fast here
        # instead of silently pointing at a file that cannot exist
        date_str = datetime.strptime(execution_date, "%Y-%m-%d").strftime("%Y-%m-%d")
    else:
        date_str = execution_date.strftime("%Y-%m-%d")
    file_path = f"{events_path}/events_{date_str}.csv"

    # There is no explicit existence check: a missing or unreadable file is
    # expected to surface as an exception from the Spark calls below and is
    # then reported as a "failed" result
    try:
        # Load the daily events
        print(f"Processing events for {date_str}")
        daily_df = spark.read.option("header", "true").option("inferSchema", "true").csv(file_path)

        # Count the raw rows before writing, so a failure of this action can
        # never yield a "failed" status after the output was already replaced
        record_count = daily_df.count()

        # Perform ETL operations
        # 1. Clean data: one row per event_id; null error_type becomes "" so
        #    the error counter below can compare against the empty string
        cleaned_df = daily_df.dropDuplicates(["event_id"]).na.fill("", ["error_type"])

        # 2. Transform - calculate metrics per device type and country
        metrics_df = cleaned_df.groupBy("device_type", "country") \
            .agg(
                F.count("*").alias("total_events"),
                F.avg("duration_seconds").alias("avg_duration"),
                F.sum(F.when(F.col("error_type") != "", 1).otherwise(0)).alias("error_count")
            )

        # 3. Load - save processed data in partitioned format; "overwrite"
        #    replaces any earlier output for the same date directory
        output_path = f"{base_path}/module5-orchestration/scheduling/processed_metrics/date={date_str}"
        metrics_df.write.mode("overwrite").parquet(output_path)

        print(f"Successfully processed and saved metrics for {date_str}")
        return {"date": date_str, "record_count": record_count, "status": "success"}

    except Exception as e:
        # Best-effort boundary: report the failure to the caller instead of raising
        print(f"Error processing {date_str}: {str(e)}")
        return {"date": date_str, "status": "failed", "error": str(e)}


In [0]:
# Smoke-test the ETL function by hand against one known sample date
test_date = datetime(year=2023, month=9, day=1)
result = process_daily_events(execution_date=test_date)
print(f"Processing result: {result}")

In [0]:
# Let's check the output data written for the sample date
processed_path = f"{base_path}/module5-orchestration/scheduling/processed_metrics/date=2023-09-01"
try:
    # Only the read is guarded: this is the step that fails when the
    # processing function has not produced any output yet
    processed_df = spark.read.parquet(processed_path)
except Exception as e:
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
    # propagate, and surface the underlying cause for debugging
    print("No processed data found - check if processing function executed successfully")
    print(f"Details: {e}")
else:
    print("Processed metrics data:")
    processed_df.display()
