# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, stddev, min, max

# In [None]:
# Exploratory data analysis of the ingested Iris Delta table: print summary
# statistics, compute the per-species mean of each measurement column, and
# persist those means back to Delta Lake.

# Get (or reuse) the SparkSession
spark = SparkSession.builder.appName("IrisEDA").getOrCreate()

# Everything that can fail (read, aggregation, show, write) runs inside the
# try block so the session is always stopped, even when a step raises.
try:
    # Define the path to the ingested data
    ingested_data_path = "/mnt/my_delta_lake/iris_ingested"  # Ensure this matches the ingestion path

    # Read the Delta Lake table
    iris_df = spark.read.format("delta").load(ingested_data_path)

    # Basic statistics
    summary_stats = iris_df.describe()
    summary_stats.show()

    # Group by species and calculate averages. The measurement columns are
    # listed once so each output column is consistently named "avg_<column>".
    feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    average_by_species = iris_df.groupBy("species").agg(
        *[avg(column).alias(f"avg_{column}") for column in feature_columns]
    )
    average_by_species.show()

    # You can add more EDA steps here, such as:
    # - Histograms and scatter plots (using libraries like matplotlib or seaborn)
    # - Correlation analysis
    # - Handling missing values (if any)
    # - Outlier detection

    # Optionally, save EDA results to Delta Lake ("overwrite" replaces any
    # results already stored at the target path).
    eda_results_path = "/mnt/my_delta_lake/iris_eda_results"  # Replace with your desired path
    average_by_species.write.format("delta").mode("overwrite").save(eda_results_path)
finally:
    # Release the session regardless of whether the steps above succeeded.
    spark.stop()