In [None]:
#Task 1 - Raw Data Ingestion
# Reads the raw weather CSV with an explicit schema and stores it as a Delta
# table; if the CSV is missing, records the failure in an ingestion log table.
# NOTE(review): the "delta" format requires Delta Lake to be available to this
# Spark session; nothing in this cell configures it - confirm the environment.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, DateType, FloatType
import os
#weather_data.csv
spark = SparkSession.builder.appName("Weather Data Ingestion").getOrCreate()

# Explicit column types so downstream numeric comparisons and averages operate
# on floats instead of the all-string columns a schema-less CSV read produces.
schema = StructType([
    StructField("City", StringType(), True),
    StructField("Date", DateType(), True),
    StructField("Temperature", FloatType(), True),
    StructField("Humidity", FloatType(), True)
])

file_path = "/content/sample_data/weather_data.csv"

if os.path.exists(file_path):
    # Apply the declared schema on read.
    # NOTE(review): assumes the Date column uses Spark's default date pattern
    # (yyyy-MM-dd); add .option("dateFormat", ...) if the file differs - TODO confirm.
    weather_df = (
        spark.read.format("csv")
        .option("header", "true")
        .schema(schema)
        .load(file_path)
    )

    # overwriteSchema lets the overwrite replace a table that an earlier run
    # wrote with different (e.g. all-string) column types.
    (
        weather_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save("/content/sample_data/delta/weather_raw")
    )
    print("Data ingestion completed successfully.")
else:
    print(f"File {file_path} does not exist.")
    # Write the log explicitly as Delta, matching the other tables under
    # delta/, instead of relying on the session's default data source format.
    (
        spark.createDataFrame([("File not found",)], ["Error"])
        .write.format("delta")
        .mode("append")
        .save("/content/sample_data/delta/ingestion_logs")
    )


In [None]:
#Task 2 Data Cleaning
# Loads the raw Delta table, nulls out-of-range Temperature/Humidity values,
# drops every row that contains a null, and saves the result as a Delta table.
from pyspark.sql.functions import when, col


def _null_if_outside(column_name, lower, upper):
    """Return a column expression that is null when the value is null, below
    `lower` or above `upper`, and the original value otherwise."""
    value = col(column_name)
    is_invalid = value.isNull() | (value < lower) | (value > upper)
    return when(is_invalid, None).otherwise(value)


weather_df = spark.read.format("delta").load("/content/sample_data/delta/weather_raw")

# Temperature is kept within [-50, 50] and Humidity within [0, 100]; dropna()
# then removes rows with a null in any column, including the ones just nulled.
cleaned_df = (
    weather_df
    .withColumn("Temperature", _null_if_outside("Temperature", -50, 50))
    .withColumn("Humidity", _null_if_outside("Humidity", 0, 100))
    .dropna()
)

cleaned_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/weather_cleaned")
print("Data cleaning completed successfully.")


In [None]:
#Task 3 Data Transformation
# Loads the cleaned Delta table, computes the average Temperature and Humidity
# per City, and saves the aggregated result as a Delta table.
from pyspark.sql.functions import avg

cleaned_table_path = "/content/sample_data/delta/weather_cleaned"
transformed_table_path = "/content/sample_data/delta/weather_transformed"

cleaned_df = spark.read.format("delta").load(cleaned_table_path)

# One output row per City with the mean of each measurement.
city_averages = [
    avg("Temperature").alias("Average_Temperature"),
    avg("Humidity").alias("Average_Humidity"),
]
transformed_df = cleaned_df.groupBy("City").agg(*city_averages)

transformed_df.write.format("delta").mode("overwrite").save(transformed_table_path)
print("Data transformation completed successfully.")


In [None]:
#Task 4 Build and Run a Pipeline
# Invokes the Databricks CLI once per pipeline entry and logs the outcome of
# each invocation to a log file.
import os
import subprocess
import logging


log_file = '/content/sample_data/logs/pipeline_log.log'
# The logging file handler opens the file as soon as it is created and fails
# if the parent directory is missing, so make sure the directory exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# Note: basicConfig does nothing if the root logger already has handlers
# configured (common in hosted notebooks); in that case no log file is written.
logging.basicConfig(filename=log_file, level=logging.INFO)

# NOTE(review): these are the Delta output paths written by Tasks 1-3, not
# notebook paths, and `databricks workspace import` appears to upload a file
# rather than run a notebook - confirm the intended command and targets.
notebooks = [
    "/content/sample_data/delta/weather_raw",
    "/content/sample_data/delta/weather_cleaned",
    "/content/sample_data/delta/weather_transformed"
]

for notebook in notebooks:
    try:
        # check=True raises CalledProcessError on a non-zero exit status.
        subprocess.run(["databricks", "workspace", "import", notebook], check=True)
        logging.info(f"Successfully executed {notebook}")
    except subprocess.CalledProcessError as e:
        logging.error(f"Error occurred while executing {notebook}: {e}")
    except OSError as e:
        # The CLI could not be started at all (e.g. `databricks` is not on
        # PATH); log it like any other failure and continue with the next entry
        # instead of aborting the loop with an uncaught exception.
        logging.error(f"Error occurred while executing {notebook}: {e}")
