# Compare monthly sales with previous month, same month previous year, first month of year

In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, DoubleType
from datetime import datetime

# Initialize Spark Session (if not already done)
spark = SparkSession.builder.appName("MonthlySalesComparison").getOrCreate()

# Define schema using StructType and StructField
schema = StructType([
    StructField("OrderID", IntegerType(), True),
    StructField("OrderDate", DateType(), True),
    StructField("SalesAmount", DoubleType(), True)
])

# Sample sales data
data = [
    (1, datetime(2023, 1, 15), 1000.0),
    (2, datetime(2023, 2, 20), 1500.0),
    (3, datetime(2023, 2, 25), 1200.0),
    (4, datetime(2023, 3, 15), 1800.0),
    (5, datetime(2023, 4, 10), 1100.0),
    (6, datetime(2023, 5, 25), 1300.0),
    (7, datetime(2023, 5, 30), 900.0),
    (8, datetime(2023, 6, 17), 950.0),
    (9, datetime(2023, 7, 19), 1250.0),
    (10, datetime(2023, 8, 11), 1450.0),
    (11, datetime(2023, 9, 14), 1150.0),
    (12, datetime(2023, 10, 18), 1500.0),
    (13, datetime(2022, 1, 10), 800.0),
    (14, datetime(2022, 2, 22), 1400.0),
    (15, datetime(2022, 3, 15), 1700.0),
    (16, datetime(2022, 4, 10), 1000.0)
]

# Create DataFrame
df = spark.createDataFrame(data, schema)
df.show(truncate=False)



+-------+----------+-----------+
|OrderID|OrderDate |SalesAmount|
+-------+----------+-----------+
|1      |2023-01-15|1000.0     |
|2      |2023-02-20|1500.0     |
|3      |2023-02-25|1200.0     |
|4      |2023-03-15|1800.0     |
|5      |2023-04-10|1100.0     |
|6      |2023-05-25|1300.0     |
|7      |2023-05-30|900.0      |
|8      |2023-06-17|950.0      |
|9      |2023-07-19|1250.0     |
|10     |2023-08-11|1450.0     |
|11     |2023-09-14|1150.0     |
|12     |2023-10-18|1500.0     |
|13     |2022-01-10|800.0      |
|14     |2022-02-22|1400.0     |
|15     |2022-03-15|1700.0     |
|16     |2022-04-10|1000.0     |
+-------+----------+-----------+



In [2]:
# Register the DataFrame as a temporary table
df.createOrReplaceTempView("Orders")



In [3]:
query = spark.sql ("""

   WITH MonthlySales AS (
    SELECT 
        YEAR(OrderDate) AS Year, 
        MONTH(OrderDate) AS Month, 
        SUM(SalesAmount) AS MonthlySales
    FROM Orders
    GROUP BY YEAR(OrderDate), MONTH(OrderDate)
),
MonthlyComparison AS (
    SELECT 
        Year, 
        Month, 
        MonthlySales,
        LAG(MonthlySales, 1) OVER (ORDER BY Year, Month) AS PreviousMonthSales
    FROM MonthlySales
)
SELECT 
    Year, 
    Month, 
    MonthlySales, 
    PreviousMonthSales, 
    (MonthlySales - PreviousMonthSales) AS DifferenceWithPreviousMonth
FROM MonthlyComparison
ORDER BY Year, Month;
                              
                   
                   """)

query.show()

+----+-----+------------+------------------+---------------------------+
|Year|Month|MonthlySales|PreviousMonthSales|DifferenceWithPreviousMonth|
+----+-----+------------+------------------+---------------------------+
|2022|    1|       800.0|              null|                       null|
|2022|    2|      1400.0|             800.0|                      600.0|
|2022|    3|      1700.0|            1400.0|                      300.0|
|2022|    4|      1000.0|            1700.0|                     -700.0|
|2023|    1|      1000.0|            1000.0|                        0.0|
|2023|    2|      2700.0|            1000.0|                     1700.0|
|2023|    3|      1800.0|            2700.0|                     -900.0|
|2023|    4|      1100.0|            1800.0|                     -700.0|
|2023|    5|      2200.0|            1100.0|                     1100.0|
|2023|    6|       950.0|            2200.0|                    -1250.0|
|2023|    7|      1250.0|             950.0|       

In [4]:
query1 = spark.sql("""                  
  
  WITH MonthlySales AS (
    SELECT 
        YEAR(OrderDate) AS Year, 
        MONTH(OrderDate) AS Month, 
        SUM(SalesAmount) AS MonthlySales
    FROM Orders
    GROUP BY YEAR(OrderDate), MONTH(OrderDate)
),
YearlyComparison AS (
    SELECT 
        Year, 
        Month, 
        MonthlySales,
        LAG(MonthlySales, 12) OVER (ORDER BY Year, Month) AS PreviousYearSameMonthSales
    FROM MonthlySales
)
SELECT 
    Year, 
    Month, 
    MonthlySales, 
    PreviousYearSameMonthSales, 
    (MonthlySales - PreviousYearSameMonthSales) AS DifferenceWithPreviousYearSameMonth
FROM YearlyComparison
ORDER BY Year, Month;

                   
                   """)

query1.show()

+----+-----+------------+--------------------------+-----------------------------------+
|Year|Month|MonthlySales|PreviousYearSameMonthSales|DifferenceWithPreviousYearSameMonth|
+----+-----+------------+--------------------------+-----------------------------------+
|2022|    1|       800.0|                      null|                               null|
|2022|    2|      1400.0|                      null|                               null|
|2022|    3|      1700.0|                      null|                               null|
|2022|    4|      1000.0|                      null|                               null|
|2023|    1|      1000.0|                      null|                               null|
|2023|    2|      2700.0|                      null|                               null|
|2023|    3|      1800.0|                      null|                               null|
|2023|    4|      1100.0|                      null|                               null|
|2023|    5|      220

In [5]:
query2 = spark.sql("""                
 
 WITH MonthlySales AS (
    SELECT 
        YEAR(OrderDate) AS Year, 
        MONTH(OrderDate) AS Month, 
        SUM(SalesAmount) AS MonthlySales
    FROM Orders
    GROUP BY YEAR(OrderDate), MONTH(OrderDate)
),
FirstMonthComparison AS (
    SELECT 
        Year, 
        Month, 
        MonthlySales,
        FIRST_VALUE(MonthlySales) OVER (PARTITION BY Year ORDER BY Month) AS FirstMonthSales
    FROM MonthlySales
)
SELECT 
    Year, 
    Month, 
    MonthlySales, 
    FirstMonthSales, 
    (MonthlySales - FirstMonthSales) AS DifferenceWithFirstMonth
FROM FirstMonthComparison
ORDER BY Year, Month;

                   
                   """)

query2.show()

+----+-----+------------+---------------+------------------------+
|Year|Month|MonthlySales|FirstMonthSales|DifferenceWithFirstMonth|
+----+-----+------------+---------------+------------------------+
|2022|    1|       800.0|          800.0|                     0.0|
|2022|    2|      1400.0|          800.0|                   600.0|
|2022|    3|      1700.0|          800.0|                   900.0|
|2022|    4|      1000.0|          800.0|                   200.0|
|2023|    1|      1000.0|         1000.0|                     0.0|
|2023|    2|      2700.0|         1000.0|                  1700.0|
|2023|    3|      1800.0|         1000.0|                   800.0|
|2023|    4|      1100.0|         1000.0|                   100.0|
|2023|    5|      2200.0|         1000.0|                  1200.0|
|2023|    6|       950.0|         1000.0|                   -50.0|
|2023|    7|      1250.0|         1000.0|                   250.0|
|2023|    8|      1450.0|         1000.0|                   45

In [6]:
# Execute Method 1 Query
query1 = spark.sql("""
WITH MonthlySales AS (
    SELECT 
        YEAR(OrderDate) AS Year, 
        MONTH(OrderDate) AS Month, 
        SUM(SalesAmount) AS MonthlySales
    FROM Orders
    GROUP BY YEAR(OrderDate), MONTH(OrderDate)
),
MonthlyComparison AS (
    SELECT 
        Year, 
        Month, 
        MonthlySales,
        LAG(MonthlySales, 1) OVER (ORDER BY Year, Month) AS PreviousMonthSales
    FROM MonthlySales
)
SELECT 
    Year, 
    Month, 
    MonthlySales, 
    PreviousMonthSales, 
    (MonthlySales - PreviousMonthSales) AS DifferenceWithPreviousMonth
FROM MonthlyComparison
ORDER BY Year, Month;
""")
query1.show(truncate=False)

# Execute Method 2 Query
query2 = spark.sql("""
WITH MonthlySales AS (
    SELECT 
        YEAR(OrderDate) AS Year, 
        MONTH(OrderDate) AS Month, 
        SUM(SalesAmount) AS MonthlySales
    FROM Orders
    GROUP BY YEAR(OrderDate), MONTH(OrderDate)
),
YearlyComparison AS (
    SELECT 
        Year, 
        Month, 
        MonthlySales,
        LAG(MonthlySales, 12) OVER (ORDER BY Year, Month) AS PreviousYearSameMonthSales
    FROM MonthlySales
)
SELECT 
    Year, 
    Month, 
    MonthlySales, 
    PreviousYearSameMonthSales, 
    (MonthlySales - PreviousYearSameMonthSales) AS DifferenceWithPreviousYearSameMonth
FROM YearlyComparison
ORDER BY Year, Month;
""")
query2.show(truncate=False)

# Execute Method 3 Query
query3 = spark.sql("""
WITH MonthlySales AS (
    SELECT 
        YEAR(OrderDate) AS Year, 
        MONTH(OrderDate) AS Month, 
        SUM(SalesAmount) AS MonthlySales
    FROM Orders
    GROUP BY YEAR(OrderDate), MONTH(OrderDate)
),
FirstMonthComparison AS (
    SELECT 
        Year, 
        Month, 
        MonthlySales,
        FIRST_VALUE(MonthlySales) OVER (PARTITION BY Year ORDER BY Month) AS FirstMonthSales
    FROM MonthlySales
)
SELECT 
    Year, 
    Month, 
    MonthlySales, 
    FirstMonthSales, 
    (MonthlySales - FirstMonthSales) AS DifferenceWithFirstMonth
FROM FirstMonthComparison
ORDER BY Year, Month;
""")
query3.show(truncate=False)


+----+-----+------------+------------------+---------------------------+
|Year|Month|MonthlySales|PreviousMonthSales|DifferenceWithPreviousMonth|
+----+-----+------------+------------------+---------------------------+
|2022|1    |800.0       |null              |null                       |
|2022|2    |1400.0      |800.0             |600.0                      |
|2022|3    |1700.0      |1400.0            |300.0                      |
|2022|4    |1000.0      |1700.0            |-700.0                     |
|2023|1    |1000.0      |1000.0            |0.0                        |
|2023|2    |2700.0      |1000.0            |1700.0                     |
|2023|3    |1800.0      |2700.0            |-900.0                     |
|2023|4    |1100.0      |1800.0            |-700.0                     |
|2023|5    |2200.0      |1100.0            |1100.0                     |
|2023|6    |950.0       |2200.0            |-1250.0                    |
|2023|7    |1250.0      |950.0             |300.0  

# Pyspark

In [7]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, month, year, lag, col

# Aggregate sales by month and year
monthly_sales = df.groupBy(year("OrderDate").alias("Year"), month("OrderDate").alias("Month")) \
    .agg(sum("SalesAmount").alias("MonthlySales"))

# Define window specification
window_spec = Window.orderBy("Year", "Month")

# Use `lag` to get the previous month's sales
monthly_sales_comparison = monthly_sales.withColumn("PreviousMonthSales", lag("MonthlySales", 1).over(window_spec)) \
    .withColumn("DifferenceWithPreviousMonth", col("MonthlySales") - col("PreviousMonthSales"))

monthly_sales_comparison.show(truncate=False)


+----+-----+------------+------------------+---------------------------+
|Year|Month|MonthlySales|PreviousMonthSales|DifferenceWithPreviousMonth|
+----+-----+------------+------------------+---------------------------+
|2022|1    |800.0       |null              |null                       |
|2022|2    |1400.0      |800.0             |600.0                      |
|2022|3    |1700.0      |1400.0            |300.0                      |
|2022|4    |1000.0      |1700.0            |-700.0                     |
|2023|1    |1000.0      |1000.0            |0.0                        |
|2023|2    |2700.0      |1000.0            |1700.0                     |
|2023|3    |1800.0      |2700.0            |-900.0                     |
|2023|4    |1100.0      |1800.0            |-700.0                     |
|2023|5    |2200.0      |1100.0            |1100.0                     |
|2023|6    |950.0       |2200.0            |-1250.0                    |
|2023|7    |1250.0      |950.0             |300.0  

In [9]:
# Use `lag` with an offset of 12 to get the sales from the same month of the previous year
monthly_sales_yearly_comparison = monthly_sales.withColumn("PreviousYearSameMonthSales", lag("MonthlySales", 12).over(window_spec)) \
    .withColumn("DifferenceWithPreviousYearSameMonth", col("MonthlySales") - col("PreviousYearSameMonthSales"))

monthly_sales_yearly_comparison.show(truncate=False)


+----+-----+------------+--------------------------+-----------------------------------+
|Year|Month|MonthlySales|PreviousYearSameMonthSales|DifferenceWithPreviousYearSameMonth|
+----+-----+------------+--------------------------+-----------------------------------+
|2022|1    |800.0       |null                      |null                               |
|2022|2    |1400.0      |null                      |null                               |
|2022|3    |1700.0      |null                      |null                               |
|2022|4    |1000.0      |null                      |null                               |
|2023|1    |1000.0      |null                      |null                               |
|2023|2    |2700.0      |null                      |null                               |
|2023|3    |1800.0      |null                      |null                               |
|2023|4    |1100.0      |null                      |null                               |
|2023|5    |2200.0   

In [10]:
from pyspark.sql.functions import first

# Use `first` to get the sales from the first month of the year
window_spec_first_month = Window.partitionBy("Year").orderBy("Month").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

monthly_sales_first_month_comparison = monthly_sales.withColumn("FirstMonthSales", first("MonthlySales").over(window_spec_first_month)) \
    .withColumn("DifferenceWithFirstMonth", col("MonthlySales") - col("FirstMonthSales"))

monthly_sales_first_month_comparison.show(truncate=False)


+----+-----+------------+---------------+------------------------+
|Year|Month|MonthlySales|FirstMonthSales|DifferenceWithFirstMonth|
+----+-----+------------+---------------+------------------------+
|2022|1    |800.0       |800.0          |0.0                     |
|2022|2    |1400.0      |800.0          |600.0                   |
|2022|3    |1700.0      |800.0          |900.0                   |
|2022|4    |1000.0      |800.0          |200.0                   |
|2023|1    |1000.0      |1000.0         |0.0                     |
|2023|2    |2700.0      |1000.0         |1700.0                  |
|2023|3    |1800.0      |1000.0         |800.0                   |
|2023|4    |1100.0      |1000.0         |100.0                   |
|2023|5    |2200.0      |1000.0         |1200.0                  |
|2023|6    |950.0       |1000.0         |-50.0                   |
|2023|7    |1250.0      |1000.0         |250.0                   |
|2023|8    |1450.0      |1000.0         |450.0                

In [11]:
# Combine all the comparisons in one DataFrame
combined_sales_comparison = monthly_sales \
    .withColumn("PreviousMonthSales", lag("MonthlySales", 1).over(window_spec)) \
    .withColumn("DifferenceWithPreviousMonth", col("MonthlySales") - col("PreviousMonthSales")) \
    .withColumn("PreviousYearSameMonthSales", lag("MonthlySales", 12).over(window_spec)) \
    .withColumn("DifferenceWithPreviousYearSameMonth", col("MonthlySales") - col("PreviousYearSameMonthSales")) \
    .withColumn("FirstMonthSales", first("MonthlySales").over(window_spec_first_month)) \
    .withColumn("DifferenceWithFirstMonth", col("MonthlySales") - col("FirstMonthSales"))

combined_sales_comparison.show(truncate=False)


+----+-----+------------+------------------+---------------------------+--------------------------+-----------------------------------+---------------+------------------------+
|Year|Month|MonthlySales|PreviousMonthSales|DifferenceWithPreviousMonth|PreviousYearSameMonthSales|DifferenceWithPreviousYearSameMonth|FirstMonthSales|DifferenceWithFirstMonth|
+----+-----+------------+------------------+---------------------------+--------------------------+-----------------------------------+---------------+------------------------+
|2022|1    |800.0       |null              |null                       |null                      |null                               |800.0          |0.0                     |
|2022|2    |1400.0      |800.0             |600.0                      |null                      |null                               |800.0          |600.0                   |
|2022|3    |1700.0      |1400.0            |300.0                      |null                      |null            