# Data Overview (Electronics Reviews)

This notebook demonstrates how to use the EDA utilities under `src/eda/` to compute quick statistics and produce visualizations from the processed dataset.


In [None]:
from pathlib import Path

from pyspark.sql import SparkSession

from src.eda.overview import basic_stats, rating_summary
from src.eda.activity import user_activity, item_popularity, plot_user_activity, plot_item_popularity
from src.eda.ratings import plot_distribution


In [None]:


# Start a Spark session for this EDA run (getOrCreate reuses an active one if present).
spark = SparkSession.builder.appName("eda").getOrCreate()

try:
    # Relative path: resolved against the kernel's current working directory.
    processed_dir = Path("../data/processed")
    interactions = spark.read.parquet(str(processed_dir / "interactions.parquet"))

    # Text summaries provided by src.eda.overview.
    print("Basic stats:", basic_stats(interactions))
    print("Rating summary:", rating_summary(interactions))

    # Plots provided by src.eda.activity and src.eda.ratings.
    plot_user_activity(interactions)
    plot_item_popularity(interactions)
    plot_distribution(interactions)
finally:
    # Release the session even when loading, summarising, or plotting raises;
    # otherwise a failed run would leave the Spark session alive.
    spark.stop()


Note: For large datasets, the plotting utilities limit the sample size they use. If needed, adjust this by passing a different `sample` value to the plotting calls.
