In [None]:
# Task 1: Vehicle Maintenance Data Ingestion
#
# Reads the vehicle-maintenance CSV, casts the numeric columns, and writes the
# result to a Delta table. Failures are logged and re-raised so that later
# cells never run against a missing or untyped `df`.
import logging
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col


def _dbfs_to_local(path):
    """Translate a ``dbfs:/`` URI into its ``/dbfs/`` local-mount form.

    Python's own file APIs (``os.path``, ``open``, ``logging.FileHandler``)
    do not understand the ``dbfs:`` scheme and treat it as a relative
    directory name. Paths without the ``dbfs:/`` prefix are returned unchanged.
    """
    prefix = "dbfs:/"
    if path.startswith(prefix):
        return "/dbfs/" + path[len(prefix):]
    return path


spark = SparkSession.builder.appName("VehicleMaintenance").getOrCreate()

file_path = "dbfs:/FileStore/vehicle_maintenance.csv"

# Configure logging BEFORE any I/O so the first failure is captured too.
# NOTE(review): assumes the /dbfs FUSE mount is available on this cluster, and
# that the root logger has no handlers yet (basicConfig is a no-op otherwise)
# -- confirm for the target runtime.
log_path = _dbfs_to_local("dbfs:/logs/ingestion_log.log")
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(filename=log_path, level=logging.ERROR)

# Early, human-readable hint only; Spark's own read below stays the authority
# on whether the file is actually reachable.
if not os.path.exists(_dbfs_to_local(file_path)):
    print("File not found.")

try:
    # Single read: CSV columns arrive as strings, so cast the numeric ones
    # once here and keep this typed frame as `df` for the following cells.
    df = (
        spark.read.option("header", "true").csv(file_path)
        .withColumn("ServiceCost", col("ServiceCost").cast("double"))
        .withColumn("Mileage", col("Mileage").cast("int"))
    )

    df.write.format("delta").mode("overwrite").save("/delta/vehicle_maintenance")
except Exception as e:
    logging.error(f"Error ingesting file: {e}")
    print(f"Error reading the file: {e}")
    # Re-raise: continuing would only defer the failure to a NameError on `df`
    # in the next cell.
    raise

In [None]:
# Task 2: Data Cleaning
# Keep only rows where both ServiceCost and Mileage are strictly positive,
# then collapse rows sharing the same (VehicleID, Date) pair.
df_cleaned = (
    df
    .where((col("ServiceCost") > 0) & (col("Mileage") > 0))
    .dropDuplicates(["VehicleID", "Date"])
)

# Persist the cleaned frame as a Delta table, replacing previous contents.
(
    df_cleaned.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/vehicle_maintenance_cleaned")
)


In [None]:
# Task 3: Vehicle Maintenance Analysis
# Sum of ServiceCost per VehicleID, exposed under the name TotalServiceCost.
total_cost_df = (
    df_cleaned
    .groupBy("VehicleID")
    .sum("ServiceCost")
    .withColumnRenamed("sum(ServiceCost)", "TotalServiceCost")
)

# Rows whose Mileage is strictly greater than 30000.
high_mileage_df = df_cleaned.where(col("Mileage") > 30000)

# Write both results out as Delta tables, replacing previous contents.
(
    total_cost_df.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/vehicle_maintenance_total_cost")
)
(
    high_mileage_df.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/vehicle_high_mileage")
)


In [None]:
# Task 5: Data Governance with Delta Lake
# Remove data files no longer referenced by the cleaned table.
# The retention is 168 hours (Delta's default threshold): the previous
# `RETAIN 0 HOURS` is rejected by Delta's retention safety check unless that
# check is disabled, and if forced it would delete every file needed for time
# travel and could break concurrent readers.
spark.sql("VACUUM '/delta/vehicle_maintenance_cleaned' RETAIN 168 HOURS")

# Show the table's commit history; spark.sql() alone only builds a DataFrame,
# so call show() to actually print the rows.
spark.sql("DESCRIBE HISTORY '/delta/vehicle_maintenance_cleaned'").show(truncate=False)
