In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [0]:
# Glob over all files of the Databricks sample online-retail dataset.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
file_path = "/databricks-datasets/online_retail/data-001/*"

In [0]:
# Read the CSV data at `file_path` into `df`, using the first row as the
# header and letting Spark infer the column types.
try:
    df = spark.read.csv(file_path, header=True, inferSchema=True)
except Exception as e:
    # Report the failure, then re-raise. Swallowing it would leave `df`
    # undefined (or stale from an earlier run), so every later cell would
    # fail with a misleading error instead of the real cause.
    print(f"Error reading CSV file: {e}")
    raise

# Preview the first rows. Kept outside the try block so that a display
# problem is not reported as a CSV read error.
display(df.limit(10))

In [0]:
# Parse the InvoiceDate strings (pattern "M/d/yy H:mm") into timestamps and
# add LineTotal = Quantity * UnitPrice for every row.
#
# This cell reassigns `df`, so on a re-run InvoiceDate has already been
# converted. Parsing it again with the string pattern would no longer match
# its rendered form, so the conversion is skipped once the column is a
# timestamp. This keeps the cell safe to execute more than once.
_invoice_date_type = dict(df.dtypes).get("InvoiceDate") or ""
if not _invoice_date_type.startswith("timestamp"):
    df = df.withColumn("InvoiceDate", F.to_timestamp("InvoiceDate", "M/d/yy H:mm"))

# Recomputing LineTotal from Quantity and UnitPrice is repeatable as is.
df = df.withColumn("LineTotal", F.col("Quantity") * F.col("UnitPrice"))

In [0]:
# Hand-written customer lookup rows: (CustomerID, Membership, Region).
customer_data = [
    (17850, "VIP", "United Kingdom"),
    (12345, "Standard", "France"),
]

# Build the lookup DataFrame; the column names are given in tuple order.
customer_df = spark.createDataFrame(
    data=customer_data,
    schema=["CustomerID", "Membership", "Region"],
)


In [0]:
# Attach customer attributes to each transaction. A left join keeps every
# transaction row even when the customer has no entry in customer_df; those
# rows get "Guest" as their Membership.
membership_or_guest = F.coalesce(F.col("Membership"), F.lit("Guest"))
df_joined = (
    df.join(customer_df, on="CustomerID", how="left")
    .withColumn("Membership", membership_or_guest)
)

In [0]:
# Show a five-row preview of the joined result.
joined_preview = df_joined.limit(5)
display(joined_preview)