In [None]:
# notebooks/demo.ipynb

from pyspark.sql import SparkSession
from src.data.generator import DataGenerator
from src.data.loader import DataLoader
from src.model.gmm import ScalableGMM, GMMConfig
from src.utils.metrics import MetricsCollector
from src.utils.validation import DataValidator

# Build (or reuse) the Spark session used by every step of this demo.
# NOTE(review): in client mode the driver JVM is typically already running
# when this executes, so "spark.driver.memory" set here may be ignored --
# confirm against how the notebook kernel is launched.
spark = (
    SparkSession.builder
    .appName("GMM_Demo")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Helper objects shared by the generation and training steps below.
metrics = MetricsCollector()
validator = DataValidator()
loader = DataLoader(spark)

def _executor_memory_used(session):
    """Return the on-heap storage memory in use, summed over all executors.

    PySpark's ``StatusTracker`` wrapper does not expose ``getExecutorInfos``,
    so the JVM-side tracker is queried instead. Summing over every reported
    entry also avoids an ``IndexError`` when the list is empty.

    Args:
        session: The active ``SparkSession``.

    Returns:
        Total ``usedOnHeapStorageMemory`` across the entries returned by the
        JVM status tracker (0 when none are reported).
    """
    # NOTE(review): goes through the private ``_jsc`` handle, which is not
    # available under Spark Connect -- confirm the deployment mode.
    jvm_tracker = session.sparkContext._jsc.sc().statusTracker()
    return sum(
        info.usedOnHeapStorageMemory()
        for info in jvm_tracker.getExecutorInfos()
    )


# Generate scaled data
generator = DataGenerator(spark, "gs://your-bucket/data/Credit.csv")
metrics.start_operation()
generator.generate_scaled_data("gs://your-bucket/data/scaled_data")
generation_metrics = metrics.end_operation(
    1_000_000_000,
    memory_used=_executor_memory_used(spark),
    accuracy=1.0,
)
print(f"Data generation metrics: {generation_metrics}")

# Train and transform
gmm = ScalableGMM(GMMConfig())
data = loader.load_parquet("gs://your-bucket/data/scaled_data")
metrics.start_operation()
gmm.fit(data)

# The original cell passed ``original_data`` / ``transformed_data`` to the
# validator without ever defining them, which raises NameError.
# NOTE(review): assumes the comparison is between the loaded frame and the
# model's transform of it, and that ScalableGMM exposes ``transform`` --
# confirm against src/model/gmm.py.
original_data = data
transformed_data = gmm.transform(data)

# As in the original, the row count and the validation run before
# ``end_operation`` is called, i.e. inside the timed window.
row_count = data.count()
# NOTE(review): element [1] of the validator result is used as the accuracy
# figure, as in the original -- confirm what that element represents.
training_accuracy = validator.validate_distribution(
    original_data, transformed_data
)[1]
training_metrics = metrics.end_operation(
    row_count,
    memory_used=_executor_memory_used(spark),
    accuracy=training_accuracy,
)
print(f"Training metrics: {training_metrics}")

# Report the collector's summary of the operations recorded above.
performance_summary = metrics.get_summary()
print(f"Performance summary: {performance_summary}")