In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import sum, count, to_date, col
# Spark session: register the Delta Lake SQL extension and catalog on the builder.
builder = (
    SparkSession.builder
    .appName("GoldAggregation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Pass the builder through the delta helper, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Read the cleaned Silver-layer Delta table; it is the input for every Gold aggregate.
df_silver = spark.read.load(
    "../delta/silver/online_retail_cleaned",
    format="delta",
)

# GOLD #1: daily sales totals.
# One row per calendar date (derived from InvoiceDate) carrying the summed
# quantity and the summed Quantity * UnitPrice, sorted by date ascending.
# NOTE: `sum` is pyspark.sql.functions.sum (imported above), not the builtin.
daily_sales = (
    df_silver
    .withColumn("SaleDate", to_date(col("InvoiceDate")))
    .groupBy("SaleDate")
    .agg(
        sum(col("Quantity")).alias("TotalQuantity"),
        sum(col("Quantity") * col("UnitPrice")).alias("TotalRevenue"),
    )
    .orderBy(col("SaleDate").asc())
)

# GOLD #2: best-selling products (top 10).
# Products are keyed by their Description; the ten largest summed quantities
# are kept, highest first.
top_products = (
    df_silver
    .groupBy("Description")
    .agg(sum(col("Quantity")).alias("TotalSold"))
    .orderBy("TotalSold", ascending=False)
    .limit(10)
)

# GOLD #3: sales by country.
# Revenue (Quantity * UnitPrice) summed per Country, highest revenue first.
country_sales = (
    df_silver
    .groupBy("Country")
    .agg(sum(col("Quantity") * col("UnitPrice")).alias("CountryRevenue"))
    .orderBy("CountryRevenue", ascending=False)
)

# Save the Gold tables as Delta (overwriting any previous run) and preview each.
#
# Every DataFrame below is consumed by two actions: the Delta write and the
# show() preview. Without persisting, Spark re-executes the full aggregation
# from the Silver table for the second action, and the preview is not
# guaranteed to show the rows that were saved (e.g. ties at the top-10 cutoff
# may resolve differently between runs). Persisting pins a single
# materialisation that both actions share.
_gold_outputs = [
    (daily_sales, "../delta/gold/daily_sales"),
    (top_products, "../delta/gold/top_products"),
    (country_sales, "../delta/gold/country_sales"),
]

# persist() is lazy: the cache is populated by the first action (the write).
for _gold_df, _ in _gold_outputs:
    _gold_df.persist()

try:
    # Write all three tables first, then preview them, in the original order.
    for _gold_df, _gold_path in _gold_outputs:
        _gold_df.write.format("delta").mode("overwrite").save(_gold_path)

    for _gold_df, _ in _gold_outputs:
        _gold_df.show(5)
finally:
    # Always release the cached data, even if a write or preview fails.
    for _gold_df, _ in _gold_outputs:
        _gold_df.unpersist()

print("✅ Gold aggregation işlemi tamamlandı.")

:: loading settings :: url = jar:file:/opt/miniconda3/envs/spark-delta-env/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/alialtunoglu/.ivy2/cache
The jars for the packages stored in: /Users/alialtunoglu/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2b255f35-4e5d-4589-9124-a070183b1fc8;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.3.0 in central
	found io.delta#delta-storage;2.3.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 79ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;2.3.0 from central in [default]
	io.delta#delta-storage;2.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   

25/07/28 14:33:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/07/28 14:33:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/07/28 14:33:30 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


NameError: name 'col' is not defined