## RFM-Analysis

In [None]:
import polars as pl
import os
import datetime as dt

# ---------------------------------------------------------------------------
# RFM table construction
#
# Reads the cleaned transactions parquet, aggregates one row per CustomerID
# with three metrics, and writes the result to rfm_data.parquet:
#   Recency   - whole days between the reference date and the customer's
#               most recent InvoiceDate
#   Frequency - number of distinct InvoiceNo values for the customer
#   Monetary  - sum of the "Monetary" column for the customer
# ---------------------------------------------------------------------------

# Define cleaned data path
cleaned_data_path = "C:\\Users\\agste\\Angelos Work Projects\\RFM & Clustering Project\\Data_Lake\\cleaned_data.parquet"

# Load cleaned dataset
print("Loading cleaned dataset...")
df = pl.read_parquet(cleaned_data_path)

# Define the reference date for RFM analysis (max date in dataset + 1 day).
# max() yields None when the column is empty or entirely null; fail with a
# clear message instead of an opaque "NoneType + timedelta" TypeError.
latest_invoice_date = df.select(pl.col("InvoiceDate").max())[0, 0]
if latest_invoice_date is None:
    raise ValueError(
        "Cannot compute RFM metrics: 'InvoiceDate' has no non-null values "
        f"in {cleaned_data_path}"
    )
reference_date = latest_invoice_date + dt.timedelta(days=1)

# Polars renamed DataFrame.groupby to DataFrame.group_by and later removed the
# old spelling. Resolve whichever one this installation provides so the cell
# runs on both older and current Polars releases.
_group_by = getattr(df, "group_by", None) or df.groupby

# Compute RFM Metrics
rfm = _group_by("CustomerID").agg(
    # Duration between the reference date and the customer's latest invoice.
    (reference_date - pl.col("InvoiceDate").max()).alias("Recency"),
    pl.col("InvoiceNo").n_unique().alias("Frequency"),
    pl.col("Monetary").sum().alias("Monetary"),
)

# Convert Recency to integer days.
# Same story as group_by: Expr.dt.days() was renamed to Expr.dt.total_days()
# in newer Polars, so pick the method that exists. The output column keeps
# the name "Recency" either way.
_recency_dt = pl.col("Recency").dt
_duration_to_days = getattr(_recency_dt, "total_days", None) or _recency_dt.days
rfm = rfm.with_columns(_duration_to_days())

# Save RFM data
output_dir = "C:\\Users\\agste\\Angelos Work Projects\\RFM & Clustering Project\\Data_Lake"
os.makedirs(output_dir, exist_ok=True)
rfm_path = os.path.join(output_dir, "rfm_data.parquet")
rfm.write_parquet(rfm_path)

print(f"RFM analysis complete. Data saved to {rfm_path}")
