In [1]:
#Task 1 - Raw Data Ingestion
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=6ebc0c260ecceaec2885415502392a6ecb64fe41622cc593f0d07434986c53f0
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DateType
import os

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.appName("WeatherDataIngestion").getOrCreate()

# Explicit schema for the raw CSV; every column is nullable.
schema = StructType([
    StructField("City", StringType(), True),
    StructField("Date", DateType(), True),
    StructField("Temperature", FloatType(), True),
    StructField("Humidity", FloatType(), True)
])

# Define path to the raw data
raw_data_path = "/content/sample_data/weather_data.csv"
# Output location of the raw table; this is the path the cleaning step loads.
delta_table_path = "/content/sample_data/delta/weather_raw"

# NOTE(review): format("delta") requires Delta Lake to be available to this
# Spark session; the install cell only adds pyspark -- confirm the runtime
# provides it, otherwise the write below fails and only the error is printed.

# The CSV is read only after the existence check, so a missing file produces
# the "File not found" message instead of an exception raised by the reader.
if os.path.exists(raw_data_path):
    try:
        weather_df = spark.read.csv(raw_data_path, schema=schema, header=True)

        # Overwrite so that re-running the cell replaces the previous load.
        weather_df.write.format("delta").mode("overwrite").save(delta_table_path)
        print("Data loaded and saved as Delta table.")
    except Exception as e:
        # Best-effort cell: report the failure rather than aborting the notebook.
        print(f"Error: {e}")
else:
    print(f"File not found: {raw_data_path}")

In [None]:
#Task 2 - Data Cleaning
from pyspark.sql import SparkSession

# Input (raw) and output (cleaned) Delta table locations for this step.
raw_delta_table_path = "/content/sample_data/delta/weather_raw"
cleaned_delta_table_path = "/content/sample_data/delta/weather_cleaned"

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.appName("WeatherDataCleaning").getOrCreate()

# Load the raw table and preview it.
delta_reader = spark.read.format("delta")
raw_weather_df = delta_reader.load(raw_delta_table_path)
raw_weather_df.show()

# Drop every row that contains a null in any column, then preview the result.
cleaned_weather_df = raw_weather_df.dropna()
cleaned_weather_df.show()

# Replace any previous cleaned table with the fresh result.
delta_writer = cleaned_weather_df.write.format("delta").mode("overwrite")
delta_writer.save(cleaned_delta_table_path)
print("Data cleaned and saved to a new Delta table.")


In [None]:
#Task 3: Data Transformation
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.appName("WeatherDataTransformation").getOrCreate()

# Input (cleaned) and output (transformed) Delta table locations for this step.
cleaned_delta_table_path = "/content/sample_data/delta/weather_cleaned"
transformed_delta_table_path = "/content/sample_data/delta/weather_transformed"

# Load the cleaned table and preview it.
cleaned_weather_df = spark.read.format("delta").load(cleaned_delta_table_path)
cleaned_weather_df.show()

# One output row per city with the mean temperature and mean humidity.
transformed_weather_df = cleaned_weather_df.groupBy("City").agg(
    avg("Temperature").alias("AvgTemperature"),
    avg("Humidity").alias("AvgHumidity")
)

transformed_weather_df.show()

# Replace any previous transformed table with the fresh aggregates.
transformed_weather_df.write.format("delta").mode("overwrite").save(transformed_delta_table_path)
print("Data transformed and saved to a new Delta table.")


In [None]:
#Task-4 Creating Pipelines
# 2. Add Logging to track progress and errors
import logging

# Log file is created relative to the current working directory.
PIPELINE_LOG_PATH = 'pipeline_log.log'
logging.basicConfig(filename=PIPELINE_LOG_PATH, level=logging.INFO)


def run_pipeline(notebooks, run_notebook, stop_on_failure=True):
    """Run pipeline stages in order, logging the outcome of each one.

    Args:
        notebooks: Iterable of stage identifiers (e.g. notebook names/paths),
            executed in the order given.
        run_notebook: Callable taking one stage identifier and executing it.
            It signals failure by raising an exception.
        stop_on_failure: When True (default), stop at the first failing stage,
            so stages listed after it are not run. When False, keep going and
            record every outcome.

    Returns:
        dict mapping each attempted stage identifier to True (succeeded) or
        False (raised). Stages skipped after a failure are absent.
    """
    results = {}
    for notebook in notebooks:
        try:
            run_notebook(notebook)
        except Exception as e:
            logging.error(f'Failed to execute {notebook}: {e}')
            results[notebook] = False
            if stop_on_failure:
                break
        else:
            # Success is logged only after the stage actually ran.
            logging.info(f'Successfully executed {notebook}')
            results[notebook] = True
    return results


In [None]:
#Error Handling
import logging
import os

# os.path.exists only understands local filesystem paths (not URI schemes
# such as "dbfs:/"), so the check targets the same local CSV that the
# ingestion step reads.
weather_data_path = "/content/sample_data/weather_data.csv"

try:
    # The check lives inside the try so that a missing file is logged and
    # recorded by the handler below.
    if not os.path.exists(weather_data_path):
        raise FileNotFoundError("Weather data file not found")
except Exception as e:
    logging.error(f"Error: {str(e)}")
    # Persist the message as a one-row DataFrame appended to the error-log table.
    # Relies on the `spark` session created by an earlier cell.
    error_df = spark.createDataFrame([(str(e),)], ["Error"])
    error_df.write.format("delta").mode("append").save("/delta/error_log")
    # Re-raise so the failure still stops the run after it has been recorded.
    raise

