In [None]:
# Task 1: Data Ingestion - Reading Data from Various Formats
#
# Loads the student (CSV), city (JSON) and hospital (Parquet) files from
# DBFS, then attempts to load the hospital Delta table. Relies on the `spark`
# session that the notebook runtime provides.

# CSV: the first row is treated as the header. No schema inference is
# requested, so every column is read as a string.
csv_file_path = "dbfs:/FileStore/student_info.csv"
student_df = spark.read.option("header", "true").csv(csv_file_path)

# JSON: multi-line mode allows a single record to span several lines.
json_file_path = "dbfs:/FileStore/city_info.json"
city_df = spark.read.option("multiline", "true").json(json_file_path)

# Parquet: the schema is taken from the file itself.
parquet_file_path = "dbfs:/FileStore/hospital_info.parquet"
hospital_parquet_df = spark.read.parquet(parquet_file_path)

# Delta: loading is best-effort. Bind the name up front so that a failed load
# leaves `hospital_delta_df` as None instead of undefined; downstream cells
# can then test `hospital_delta_df is not None` rather than hit a NameError.
hospital_delta_df = None
try:
    hospital_delta_df = spark.read.format("delta").load("/delta/hospital_records")
except Exception as e:
    # Report the failure but keep the rest of the notebook runnable.
    print(f"Error loading Delta table: {e}")




In [None]:
# Task 2: Writing Data to Various Formats
#
# Persists the DataFrames loaded in Task 1 as CSV, JSON, Parquet and Delta.
# Each target path becomes a directory of part files, not a single file.
#
# The default save mode raises an error when the target already exists, which
# makes this cell fail on every run after the first. "overwrite" keeps the
# cell idempotent so the notebook survives Restart & Run All.

# CSV, with a header row in each part file
student_df.write.mode("overwrite").option("header", "true").csv(
    "dbfs:/FileStore/output_student.csv"
)

# JSON (one JSON object per line)
city_df.write.mode("overwrite").json("dbfs:/FileStore/output_city.json")

# Parquet
hospital_parquet_df.write.mode("overwrite").parquet(
    "dbfs:/FileStore/output_hospital.parquet"
)

# Delta: overwriting records a new table version in the Delta log
hospital_parquet_df.write.format("delta").mode("overwrite").save(
    "/delta/output_hospital"
)

In [None]:
# Task 3: Running One Notebook from Another
# Notebook A (Data Ingestion & Cleaning):
#
# Reads the raw student CSV, drops exact duplicate rows, and publishes the
# result as a Delta table for Notebook B to consume.

# Load CSV (header row supplies the column names; all columns are strings
# because no schema inference is requested)
student_df = spark.read.option("header", "true").csv("dbfs:/FileStore/student_info.csv")

# Clean data: remove rows that are identical across every column
student_cleaned_df = student_df.dropDuplicates()

# Write cleaned data to the Delta table. "overwrite" makes the cell
# re-runnable; the default mode fails once /delta/student_cleaned exists.
student_cleaned_df.write.format("delta").mode("overwrite").save("/delta/student_cleaned")


In [None]:
# Notebook B (Data Analysis):
#
# Computes the average score per class from the cleaned student table
# produced by Notebook A and stores the result as a Delta table.

# Load Delta table from Notebook A
student_df = spark.read.format("delta").load("/delta/student_cleaned")

# Notebook A reads the CSV without schema inference, so "Score" arrives as a
# string, and GroupedData.avg() rejects non-numeric columns. Cast explicitly
# before aggregating; values that cannot be parsed become null and are
# ignored by the average.
student_scored_df = student_df.withColumn("Score", student_df["Score"].cast("double"))

# Perform analysis: mean score per class
avg_score_df = (
    student_scored_df
    .groupBy("Class")
    .avg("Score")
    .withColumnRenamed("avg(Score)", "AverageScore")
)

# Write the analysis to a Delta table ("overwrite" keeps the cell re-runnable)
avg_score_df.write.format("delta").mode("overwrite").save("/delta/student_analysis")


In [None]:
# Task 4: Databricks Ingestion
#
# Re-reads the student CSV, the city JSON and the hospital Delta table using
# scheme-less paths.

# The path must be a quoted string literal; the bare /student_info.csv that
# was here before is a syntax error that prevents the whole cell from running.
# NOTE(review): the file is assumed to live under /FileStore/, matching the
# JSON path below and the CSV path used in the earlier cells - confirm.
csv_file_path = "/FileStore/student_info.csv"
student_df = spark.read.option("header", "true").csv(csv_file_path)

# JSON: multi-line mode allows a single record to span several lines.
json_file_path = "/FileStore/city_info.json"
city_df = spark.read.option("multiline", "true").json(json_file_path)

# Delta table holding the hospital records
delta_df = spark.read.format("delta").load("/delta/hospital_records")


In [None]:
# Task 5: Optimization, Z-ordering, and VACUUM
#
# Maintenance for the hospital Delta table written in Task 2.
hospital_delta_path = "/delta/output_hospital"

# Compact small files into larger ones.
spark.sql(f"OPTIMIZE '{hospital_delta_path}'")

# Compact again while clustering the data by CityName.
# NOTE(review): assumes the table has a CityName column - confirm against the
# hospital schema.
spark.sql(f"OPTIMIZE '{hospital_delta_path}' ZORDER BY (CityName)")

# Remove data files no longer referenced by the table. A 0-hour retention is
# rejected by Delta's default retention safety check, and forcing it would
# delete all time-travel history and can break concurrent readers. Keep the
# default 7-day (168-hour) window instead.
spark.sql(f"VACUUM '{hospital_delta_path}' RETAIN 168 HOURS")

