In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Retail_sales") \
    .master("spark://spark-master:7077") \
    .config("spark.jars", "/home/jovyan/jars/postgresql-42.7.2.jar") \
    .getOrCreate()

jdbc_url = "jdbc:postgresql://postgres:5432/sql_book_o_reilly"
properties = {
    "user": "postgres",
    "password": "password",
    "driver": "org.postgresql.Driver"
}

df = spark.read.jdbc(url=jdbc_url, table="retail_sales", properties=properties)
df.show(5)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
df.createOrReplaceTempView("retail_sales")

In [None]:
from pyspark.sql.functions import year, to_date, sum as spark_sum , first_value 
from pyspark.sql.functions import year, sum as _sum, col
from pyspark.sql import functions as F
from pyspark.sql.window import Window

### YEAR extract from sales_month

In [None]:
df = df.withColumn("YEAR",year(to_date("sales_month","yyyy-MM")))
df.show(2)

+-----------+----------+--------------------+---------------+--------------------+----+
|sales_month|naics_code|    kind_of_business|reason_for_null|               sales|YEAR|
+-----------+----------+--------------------+---------------+--------------------+----+
| 1992-01-01|       441|Motor vehicle and...|           NULL|29811.00000000000...|1992|
| 1992-01-01|      4411|  Automobile dealers|           NULL|25800.00000000000...|1992|
+-----------+----------+--------------------+---------------+--------------------+----+
only showing top 2 rows



In [None]:
spark.sql("SELECT sales_month, " \
"   naics_code, " \
"kind_of_business, " \
"reason_for_null," \
"sales,date_part('year',sales_month) as YEAR" \
" FROM retail_sales").show(2)

+-----------+----------+--------------------+---------------+--------------------+----+
|sales_month|naics_code|    kind_of_business|reason_for_null|               sales|YEAR|
+-----------+----------+--------------------+---------------+--------------------+----+
| 1992-01-01|       441|Motor vehicle and...|           NULL|29811.00000000000...|1992|
| 1992-01-01|      4411|  Automobile dealers|           NULL|25800.00000000000...|1992|
+-----------+----------+--------------------+---------------+--------------------+----+
only showing top 2 rows



### sales per year

In [None]:

df.withColumn("year", year(to_date("sales_month", "yyyy-MM"))) \
  .groupBy("year") \
  .agg(spark_sum("sales").alias("total_sales")) \
  .orderBy("year") \
  .show(2)

+----+--------------------+
|year|         total_sales|
+----+--------------------+
|1992|15710631.00000000...|
|1993|16784256.00000000...|
+----+--------------------+
only showing top 2 rows



In [None]:
spark.sql("select date_part('year',sales_month) as year," \
"   sum(sales) as total_sales " \
"   from retail_sales " \
"   group by 1" \
"   order by 1").show(2)

+----+--------------------+
|year|         total_sales|
+----+--------------------+
|1992|15710631.00000000...|
|1993|16784256.00000000...|
+----+--------------------+
only showing top 2 rows



### Total Sales per business

In [None]:
df.groupBy("kind_of_business") \
  .agg(spark_sum("sales").alias("total_sales")) \
  .orderBy("kind_of_business") \
  .show(2)

+--------------------+--------------------+
|    kind_of_business|         total_sales|
+--------------------+--------------------+
|All other gen. me...|1406616.000000000...|
|All other home fu...|515039.0000000000...|
+--------------------+--------------------+
only showing top 2 rows



In [None]:
spark.sql("select kind_of_business," \
"   sum(sales) as total_sales " \
"   from retail_sales " \
"   group by 1" \
"   order by 1").show(2)

+--------------------+--------------------+
|    kind_of_business|         total_sales|
+--------------------+--------------------+
|All other gen. me...|1406616.000000000...|
|All other home fu...|515039.0000000000...|
+--------------------+--------------------+
only showing top 2 rows



### count of different kind of business line.

In [None]:
df.select('kind_of_business').distinct().count()

65

In [None]:
spark.sql("select count(distinct(kind_of_business)) from retail_sales").show()

+--------------------------------+
|count(DISTINCT kind_of_business)|
+--------------------------------+
|                              65|
+--------------------------------+



### look at sales at women’s clothing stores and at men’s clothing stores per year.

In [None]:

df_filtered = df.filter(
    df.kind_of_business.isin("Men's clothing stores", "Women's clothing stores")
).withColumn("sales_year", year("sales_month")) \
 .groupBy("sales_year", "kind_of_business") \
 .agg(_sum("sales").alias("total_sales")) \
 .orderBy("sales_year", "kind_of_business")

df_filtered.show(5)


+----------+--------------------+--------------------+
|sales_year|    kind_of_business|         total_sales|
+----------+--------------------+--------------------+
|      1992|Men's clothing st...|10179.00000000000...|
|      1992|Women's clothing ...|31815.00000000000...|
|      1993|Men's clothing st...|9962.000000000000...|
|      1993|Women's clothing ...|32350.00000000000...|
|      1994|Men's clothing st...|10032.00000000000...|
+----------+--------------------+--------------------+
only showing top 5 rows



In [None]:
spark.sql("""
    SELECT 
    DATE_PART('year', sales_month) AS sales_year,
    kind_of_business,
    SUM(sales) AS total_sales
FROM retail_sales
WHERE kind_of_business IN (
    "Men's clothing stores", "Women's clothing stores"
)
GROUP BY 1, 2
ORDER BY 1, 2
""").show(5)

+----------+--------------------+--------------------+
|sales_year|    kind_of_business|         total_sales|
+----------+--------------------+--------------------+
|      1992|Men's clothing st...|10179.00000000000...|
|      1992|Women's clothing ...|31815.00000000000...|
|      1993|Men's clothing st...|9962.000000000000...|
|      1993|Women's clothing ...|32350.00000000000...|
|      1994|Men's clothing st...|10032.00000000000...|
+----------+--------------------+--------------------+
only showing top 5 rows



### Pivoting Men's and Women's

In [None]:
df_filtered.groupBy('sales_year').\
    pivot('kind_of_business').\
    sum('total_sales').\
    orderBy('sales_year').\
    show(4)

+----------+---------------------+-----------------------+
|sales_year|Men's clothing stores|Women's clothing stores|
+----------+---------------------+-----------------------+
|      1992| 10179.00000000000...|   31815.00000000000...|
|      1993| 9962.000000000000...|   32350.00000000000...|
|      1994| 10032.00000000000...|   30585.00000000000...|
|      1995| 9315.000000000000...|   28696.00000000000...|
+----------+---------------------+-----------------------+
only showing top 4 rows



In [None]:
spark.sql("""
    SELECT 
        date_part('year', sales_month) AS year,
        SUM(CASE 
            WHEN kind_of_business = "Men's clothing stores" 
            THEN sales 
        END) AS mens_sales,
        SUM(CASE 
            WHEN kind_of_business = "Women's clothing stores" 
            THEN sales 
        END) AS womens_sales
    FROM retail_sales
    WHERE kind_of_business IN ("Men's clothing stores", "Women's clothing stores")
    GROUP BY year
    ORDER BY 1
""").show(4)


+----+--------------------+--------------------+
|year|          mens_sales|        womens_sales|
+----+--------------------+--------------------+
|1992|10179.00000000000...|31815.00000000000...|
|1993|9962.000000000000...|32350.00000000000...|
|1994|10032.00000000000...|30585.00000000000...|
|1995|9315.000000000000...|28696.00000000000...|
+----+--------------------+--------------------+
only showing top 4 rows



### transform the sales_month column -- spliting month, date, and year(already done).

In [None]:
df.printSchema()

root
 |-- sales_month: date (nullable = true)
 |-- naics_code: string (nullable = true)
 |-- kind_of_business: string (nullable = true)
 |-- reason_for_null: string (nullable = true)
 |-- sales: decimal(38,18) (nullable = true)
 |-- YEAR: integer (nullable = true)



In [None]:
df = df.withColumn("month",F.month(F.col("sales_month")))
df_transformed = df.withColumn("date",F.dayofmonth(F.col("sales_month")))

In [None]:
df_transformed.show(3)

+-----------+----------+--------------------+---------------+--------------------+----+-----+----+
|sales_month|naics_code|    kind_of_business|reason_for_null|               sales|YEAR|month|date|
+-----------+----------+--------------------+---------------+--------------------+----+-----+----+
| 1992-01-01|       441|Motor vehicle and...|           NULL|29811.00000000000...|1992|    1|   1|
| 1992-01-01|      4411|  Automobile dealers|           NULL|25800.00000000000...|1992|    1|   1|
| 1992-01-01|4411, 4412|Automobile and ot...|           NULL|26788.00000000000...|1992|    1|   1|
+-----------+----------+--------------------+---------------+--------------------+----+-----+----+
only showing top 3 rows



In [None]:
spark.sql("" 
"   SELECT *, " \
"       date_part('year',sales_month) as year, " \
"       date_part('month',sales_month) as month, " \
"       date_part('day',sales_month) as day " \
"   FROM retail_sales").show(3)

+-----------+----------+--------------------+---------------+--------------------+----+-----+---+
|sales_month|naics_code|    kind_of_business|reason_for_null|               sales|year|month|day|
+-----------+----------+--------------------+---------------+--------------------+----+-----+---+
| 1992-01-01|       441|Motor vehicle and...|           NULL|29811.00000000000...|1992|    1|  1|
| 1992-01-01|      4411|  Automobile dealers|           NULL|25800.00000000000...|1992|    1|  1|
| 1992-01-01|4411, 4412|Automobile and ot...|           NULL|26788.00000000000...|1992|    1|  1|
+-----------+----------+--------------------+---------------+--------------------+----+-----+---+
only showing top 3 rows



### % of each business line

In [None]:
df_grouped = df_transformed.groupBy("kind_of_business").agg(F.sum("sales").alias("total_sales"))
total_sum = df_grouped.agg(F.sum("total_sales")).collect()[0][0]

In [None]:
total_sum

Decimal('926212990.000000000000000000')

In [None]:
df_with_percent = df_grouped.withColumn("sales_%",(F.col("total_sales")/total_sum)*100)
df_with_percent.orderBy(F.desc("sales_%")).show(4)

+--------------------+--------------------+---------+
|    kind_of_business|         total_sales|  sales_%|
+--------------------+--------------------+---------+
|Retail and food s...|118053993.0000000...|12.745900|
|Retail sales and ...|107701613.0000000...|11.628200|
| Retail sales, total|105580364.0000000...|11.399100|
|Retail sales and ...|93509935.00000000...|10.095900|
+--------------------+--------------------+---------+
only showing top 4 rows



In [None]:
spark.sql("""
    WITH total_cte AS (
        SELECT SUM(sales) AS total_amount
        FROM retail_sales
    )
    SELECT 
        r.kind_of_business,
        SUM(r.sales) AS total_sales,
        SUM(r.sales) / t.total_amount AS percent_of_total
    FROM retail_sales r
    CROSS JOIN total_cte t
    GROUP BY r.kind_of_business, t.total_amount
    ORDER BY percent_of_total desc
""").show(4)


+--------------------+--------------------+----------------+
|    kind_of_business|         total_sales|percent_of_total|
+--------------------+--------------------+----------------+
|Retail and food s...|118053993.0000000...|        0.127459|
|Retail sales and ...|107701613.0000000...|        0.116282|
| Retail sales, total|105580364.0000000...|        0.113991|
|Retail sales and ...|93509935.00000000...|        0.100959|
+--------------------+--------------------+----------------+
only showing top 4 rows



### year-wise percentage of each business line

In [None]:
df_yearly = df.withColumn("sales_year", F.year("sales_month"))

df_grouped = df_yearly.groupBy("sales_year", "kind_of_business").agg(F.sum("sales").alias("total_sales"))

window = Window.partitionBy("sales_year")
df_with_percent = df_grouped.withColumn("yearly_total", F.sum("total_sales").over(window))
df_with_percent = df_with_percent.withColumn("sales_percent", (F.col("total_sales") / F.col("yearly_total")) * 100)

df_with_percent.orderBy("sales_year", F.desc("sales_percent")).show(5,truncate=False)

+----------+---------------------------------------------------------------------------------+--------------------------+---------------------------+-------------+
|sales_year|kind_of_business                                                                 |total_sales               |yearly_total               |sales_percent|
+----------+---------------------------------------------------------------------------------+--------------------------+---------------------------+-------------+
|1992      |Retail and food services sales, total                                            |2014102.000000000000000000|15710631.000000000000000000|12.820000    |
|1992      |Retail sales and food services excl gasoline stations                            |1857778.000000000000000000|15710631.000000000000000000|11.825000    |
|1992      |Retail sales, total                                                              |1811237.000000000000000000|15710631.000000000000000000|11.528700    |
|1992      |Reta

In [None]:
spark.sql("""
    WITH yearly_per_business AS (
        SELECT
            year(sales_month) AS year,
            kind_of_business,
            SUM(sales) AS total_sales_per_business
        FROM retail_sales
        GROUP BY 1, 2
    ),  
    yearly_total_calculation AS (  
        SELECT
            year,
            kind_of_business,
            total_sales_per_business,
            SUM(total_sales_per_business) OVER (PARTITION BY year) AS yearly_total
        FROM yearly_per_business
    )
    SELECT
        year,
        kind_of_business,
        total_sales_per_business,
        yearly_total,
        (total_sales_per_business/yearly_total)*100 as sales_percent
    FROM yearly_total_calculation
    ORDER BY year, total_sales_per_business DESC;
""").show(5)


+----+--------------------+------------------------+--------------------+-------------+
|year|    kind_of_business|total_sales_per_business|        yearly_total|sales_percent|
+----+--------------------+------------------------+--------------------+-------------+
|1992|Retail and food s...|    2014102.000000000...|15710631.00000000...|    12.820000|
|1992|Retail sales and ...|    1857778.000000000...|15710631.00000000...|    11.825000|
|1992| Retail sales, total|    1811237.000000000...|15710631.00000000...|    11.528700|
|1992|Retail sales and ...|    1595709.000000000...|15710631.00000000...|    10.156900|
|1992|Retail sales and ...|    1439385.000000000...|15710631.00000000...|     9.161900|
+----+--------------------+------------------------+--------------------+-------------+
only showing top 5 rows



### INDEXING using the business line of women and men

In [None]:
df_women_and_men = df.filter(
    ((F.col("kind_of_business") == "Women's clothing stores") |
     (F.col("kind_of_business") == "Men's clothing stores")) &
    (F.col("sales_month") <= F.lit("2019-12-31"))
)

# Extract sales_year
df_yearly_women = df_women_and_men.withColumn("sales_year", F.year(F.col("sales_month")))

# Group by sales_year and sum sales
df_grouped_women = df_yearly_women.groupBy("sales_year","kind_of_business").agg(F.sum("sales").alias("sales"))

# Define the window and apply first_value (partition not needed since it's only one business)
window_f = Window.partitionBy("kind_of_business").orderBy("sales_year")

# Apply first_value
df_first_value_women = df_grouped_women.withColumn(
    "index_sales", F.first_value(F.col("sales")).over(window_f)
)

# Percent change with base year
df_women_pct_change = df_first_value_women.withColumn(
    "pct_change_with_base_year",
    ((F.col("sales") / F.col("index_sales")) - 1) * 100
)

df_women_pct_change.orderBy("sales_year").show(5, truncate=False)


+----------+-----------------------+------------------------+------------------------+-------------------------+
|sales_year|kind_of_business       |sales                   |index_sales             |pct_change_with_base_year|
+----------+-----------------------+------------------------+------------------------+-------------------------+
|1992      |Men's clothing stores  |10179.000000000000000000|10179.000000000000000000|0.000000                 |
|1992      |Women's clothing stores|31815.000000000000000000|31815.000000000000000000|0.000000                 |
|1993      |Men's clothing stores  |9962.000000000000000000 |10179.000000000000000000|-2.131800                |
|1993      |Women's clothing stores|32350.000000000000000000|31815.000000000000000000|1.681600                 |
|1994      |Men's clothing stores  |10032.000000000000000000|10179.000000000000000000|-1.444100                |
+----------+-----------------------+------------------------+------------------------+----------

In [None]:
spark.sql("""
    SELECT 
          sales_year, 
          sales,
          kind_of_business,
          ((sales / first_value(sales) OVER (PARTITION BY kind_of_business ORDER BY sales_year)) - 1) * 100 
              AS pct_from_index
    FROM (
         SELECT 
             year(sales_month) AS sales_year,
             kind_of_business,
             SUM(sales) AS sales
         FROM retail_sales
         WHERE kind_of_business IN ("Men's clothing stores","Women's clothing stores")
           AND sales_month <= '2019-12-31'
         GROUP BY year(sales_month), kind_of_business
    ) a
    ORDER BY 1
""").show(5)


+----------+--------------------+--------------------+--------------+
|sales_year|               sales|    kind_of_business|pct_from_index|
+----------+--------------------+--------------------+--------------+
|      1992|10179.00000000000...|Men's clothing st...|      0.000000|
|      1992|31815.00000000000...|Women's clothing ...|      0.000000|
|      1993|9962.000000000000...|Men's clothing st...|     -2.131800|
|      1993|32350.00000000000...|Women's clothing ...|      1.681600|
|      1994|10032.00000000000...|Men's clothing st...|     -1.444100|
+----------+--------------------+--------------------+--------------+
only showing top 5 rows



### Rolling Time Windows

use a window of 12 months to get rolling annual sales

In [None]:
from pyspark.sql import functions as F

# Filter for Women's clothing stores
df_women = df.filter(F.col("kind_of_business") == "Women's clothing stores") \
             .withColumn("sales_month", F.to_date("sales_month"))

# Anchor month
anchor_month = "2019-12-01"

# Get sales for anchor month
anchor_df = df_women.filter(F.col("sales_month") == anchor_month) \
    .select(
        F.col("sales_month").alias("sales_month"),
        F.col("sales").alias("sales")
    )

# Get rolling months (last 12 months from anchor)
rolling_df = df_women.filter(
    (F.col("sales_month") <= anchor_month) &
    (F.col("sales_month") >= F.add_months(F.lit(anchor_month), -11))
).select(
    F.col("sales_month").alias("rolling_sales_month"),
    F.col("sales").alias("rolling_sales")
)

# Cross join anchor month with rolling months (to simulate SQL join)
result = anchor_df.crossJoin(rolling_df)

result.show(truncate=False)


+-----------+-----------------------+-------------------+-----------------------+
|sales_month|sales                  |rolling_sales_month|rolling_sales          |
+-----------+-----------------------+-------------------+-----------------------+
|2019-12-01 |4496.000000000000000000|2019-01-01         |2511.000000000000000000|
|2019-12-01 |4496.000000000000000000|2019-02-01         |2680.000000000000000000|
|2019-12-01 |4496.000000000000000000|2019-03-01         |3585.000000000000000000|
|2019-12-01 |4496.000000000000000000|2019-04-01         |3604.000000000000000000|
|2019-12-01 |4496.000000000000000000|2019-05-01         |3807.000000000000000000|
|2019-12-01 |4496.000000000000000000|2019-06-01         |3272.000000000000000000|
|2019-12-01 |4496.000000000000000000|2019-07-01         |3261.000000000000000000|
|2019-12-01 |4496.000000000000000000|2019-08-01         |3325.000000000000000000|
|2019-12-01 |4496.000000000000000000|2019-09-01         |3080.000000000000000000|
|2019-12-01 |449

In [None]:
spark.sql("""
    SELECT a.sales_month
,a.sales
,b.sales_month as rolling_sales_month
,b.sales as rolling_sales
FROM retail_sales a
JOIN retail_sales b on a.kind_of_business = b.kind_of_business
 and b.sales_month between a.sales_month - interval '11 months'
 and a.sales_month
 and b.kind_of_business = "Women's clothing stores"
WHERE a.kind_of_business = "Women's clothing stores"
and a.sales_month = '2019-12-01'
;
    """).show()

+-----------+--------------------+-------------------+--------------------+
|sales_month|               sales|rolling_sales_month|       rolling_sales|
+-----------+--------------------+-------------------+--------------------+
| 2019-12-01|4496.000000000000...|         2019-01-01|2511.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-02-01|2680.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-03-01|3585.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-04-01|3604.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-05-01|3807.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-06-01|3272.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-07-01|3261.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-08-01|3325.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-09-01|3080.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-10-01|3390.000000000000...|
| 2019-12-01

### Monthly sales and 12-month moving average sales for women’s clothing stores

In [None]:
window_spec = (
    Window
    .orderBy("sales_month")
    .rowsBetween(-11, 0)   # 11 preceding rows + current row
)

df_result = (
    df
    .filter(F.col("kind_of_business") == "Women's clothing stores")
    .withColumn("moving_avg", F.avg("sales").over(window_spec))
    .withColumn("records_count", F.count("sales").over(window_spec))
    .select("sales_month","moving_avg","records_count")
)
df_result.show(4)

+-----------+--------------------+-------------+
|sales_month|          moving_avg|records_count|
+-----------+--------------------+-------------+
| 1992-01-01|1873.000000000000...|            1|
| 1992-02-01|1932.000000000000...|            2|
| 1992-03-01|2089.000000000000...|            3|
| 1992-04-01|2233.000000000000...|            4|
+-----------+--------------------+-------------+
only showing top 4 rows



In [None]:
spark.sql("""
        SELECT 
          sales_month,
          avg(sales) over (order by sales_month rows between 11 preceding and current row) as moving_avg,
          count(sales) over (order by sales_month rows between 11 preceding and current row) as records_count
FROM retail_sales
WHERE kind_of_business = "Women's clothing stores"
;
          """).show(4)

+-----------+--------------------+-------------+
|sales_month|          moving_avg|records_count|
+-----------+--------------------+-------------+
| 1992-01-01|1873.000000000000...|            1|
| 1992-02-01|1932.000000000000...|            2|
| 1992-03-01|2089.000000000000...|            3|
| 1992-04-01|2233.000000000000...|            4|
+-----------+--------------------+-------------+
only showing top 4 rows



### Rolling Time Windows with Sparse Data

In [None]:
# Lets pull the date_dim table from postgres or csv 
df_date = spark.read.jdbc(url=jdbc_url, table="date_dim", properties=properties)
df_date.show(5)

+----------+--------+------------+-----------+-----------+---------+--------------+-----------+-------------+----------+------------+----------+----------------+------------------+-----------------+--------------+------------+--------------------+-------------------+----+------+-------+
|      date|date_key|day_of_month|day_of_year|day_of_week| day_name|day_short_name|week_number|week_of_month|      week|month_number|month_name|month_short_name|first_day_of_month|last_day_of_month|quarter_number|quarter_name|first_day_of_quarter|last_day_of_quarter|year|decade|century|
+----------+--------+------------+-----------+-----------+---------+--------------+-----------+-------------+----------+------------+----------+----------------+------------------+-----------------+--------------+------------+--------------------+-------------------+----+------+-------+
|1770-01-01|17700101|           1|          1|          1|   Monday|           Mon|          1|            1|1770-01-01|           1|   

In [None]:
from pyspark.sql import functions as F

#Filter retail_sales 
retail_sales_filtered = (
    df
    .filter(
        (F.col("kind_of_business") == "Women's clothing stores") &
        (F.month("sales_month").isin([1, 7]))
    )
    .select("sales_month", "sales")
)

# Step 2: Filter date_dim 
date_dim_filtered = (
    df_date
    .filter(
        (F.col("date") == F.col("first_day_of_month")) &
        (F.col("date").between("1993-01-01", "2020-12-01"))
    )
)

# Step 3: Join condition
joined_df = (
    date_dim_filtered.alias("a")
    .join(
        retail_sales_filtered.alias("b"),
        (F.col("b.sales_month").between(F.add_months(F.col("a.date"), -11), F.col("a.date"))),"inner")
    .select("a.date", "b.sales_month", "b.sales")
)

# Final result
joined_df.show(5, False)


+----------+-----------+-----------------------+
|date      |sales_month|sales                  |
+----------+-----------+-----------------------+
|1993-01-01|1992-07-01 |2373.000000000000000000|
|1993-01-01|1993-01-01 |2123.000000000000000000|
|1993-02-01|1992-07-01 |2373.000000000000000000|
|1993-02-01|1993-01-01 |2123.000000000000000000|
|1993-03-01|1992-07-01 |2373.000000000000000000|
+----------+-----------+-----------------------+
only showing top 5 rows



In [None]:
spark.sql("""
        SELECT 
                a.sales_month
                ,a.sales
                ,b.sales_month as rolling_sales_month
                ,b.sales as rolling_sales
        FROM retail_sales a
        JOIN retail_sales b on a.kind_of_business = b.kind_of_business
                and b.sales_month between a.sales_month - interval '11 months'
                and a.sales_month
                and b.kind_of_business = "Women's clothing stores"
        WHERE a.kind_of_business = "Women's clothing stores"
                and a.sales_month = '2019-12-01'
;
  """).show(5)

+-----------+--------------------+-------------------+--------------------+
|sales_month|               sales|rolling_sales_month|       rolling_sales|
+-----------+--------------------+-------------------+--------------------+
| 2019-12-01|4496.000000000000...|         2019-01-01|2511.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-02-01|2680.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-03-01|3585.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-04-01|3604.000000000000...|
| 2019-12-01|4496.000000000000...|         2019-05-01|3807.000000000000...|
+-----------+--------------------+-------------------+--------------------+
only showing top 5 rows

