In [2]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import col, max as spark_max, count, sum as spark_sum, datediff, to_date, lit

# SparkSession'ı başlat

builder = SparkSession.builder \
    .appName("Gold Layer - RFM") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Silver verisini yükle
df_silver = spark.read.format("delta").load("../delta/silver/online_retail_cleaned")

# Bugünün tarihi
latest_date = df_silver.agg(spark_max("InvoiceDate")).collect()[0][0]

# RFM hesaplama
rfm = df_silver \
    .withColumn("InvoiceDateOnly", to_date("InvoiceDate")) \
    .groupBy("CustomerID") \
    .agg(
        datediff(lit(latest_date), spark_max("InvoiceDateOnly")).alias("Recency"),
        count("InvoiceNo").alias("Frequency"),
        spark_sum(col("Quantity") * col("UnitPrice")).alias("Monetary")
    )

rfm.write.format("delta").mode("overwrite").save("../delta/gold/rfm_table")

rfm.show(5)



+----------+-------+---------+-----------------+
|CustomerID|Recency|Frequency|         Monetary|
+----------+-------+---------+-----------------+
|   15039.0|      9|     1502|19914.43999999999|
|   13259.0|     61|       87|292.3199999999999|
|   16982.0|     60|       22|           384.06|
|   17966.0|     37|       68|          1098.43|
|   13178.0|     26|      265|5725.469999999999|
+----------+-------+---------+-----------------+
only showing top 5 rows

