In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, DateType
from pyspark.sql import functions as F
from pyspark.sql import Window

# Explicit schema for the sales-order-detail CSV: eleven columns, all nullable.
# Supplying it up front means Spark uses these types instead of inferring them.
schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in (
        ("SalesOrderID", IntegerType()),
        ("SalesOrderDetailID", IntegerType()),
        ("CarrierTrackingNumber", StringType()),
        ("OrderQty", IntegerType()),
        ("ProductID", IntegerType()),
        ("SpecialOfferID", IntegerType()),
        # Monetary columns are fixed-point decimals with two fractional digits.
        ("UnitPrice", DecimalType(10, 2)),
        ("UnitPriceDiscount", DecimalType(10, 2)),
        ("LineTotal", DecimalType(20, 2)),
        ("rowguid", StringType()),
        ("ModifiedDate", DateType()),
    )
])

# Read the CSV with the explicit schema; header is disabled, so the first
# line of the file is treated as data rather than as column names.
sales_order_detail_df = (
    spark.read
    .format('csv')
    .option('header', 'false')
    .schema(schema)
    .load("/FileStore/tables/Sales_SalesOrderDetail__2_.csv")
)

# Append OrderYear and OrderMonth, both derived from ModifiedDate, after the
# existing columns so later steps can group by calendar month.
sales_order_detail_df = sales_order_detail_df.select(
    "*",
    F.year("ModifiedDate").alias("OrderYear"),
    F.month("ModifiedDate").alias("OrderMonth"),
)

# Highest single LineTotal observed in each (OrderYear, OrderMonth).
# NOTE(review): this is the maximum of individual line totals, not of
# per-order totals. If the highest *order* per month is what is wanted,
# LineTotal would first have to be summed per SalesOrderID -- confirm intent.
monthly_sales_df = sales_order_detail_df.groupBy(
    "OrderYear",
    "OrderMonth",
).agg(
    F.max("LineTotal").alias("HighestSale")
)

# No ranking step is needed: groupBy leaves exactly one row per
# (OrderYear, OrderMonth), so a rank() == 1 filter over a window partitioned
# by those same columns would keep every row and only add a shuffle.
# The separate name is kept for the code that consumes it below.
highest_monthly_sales_df = monthly_sales_df

# Print the monthly maxima in chronological order (show() prints the first
# 20 rows by default).
(
    highest_monthly_sales_df
    .select("OrderYear", "OrderMonth", "HighestSale")
    .orderBy("OrderYear", "OrderMonth")
    .show()
)


# Debugging aids: uncomment to inspect the loaded data and its schema
# sales_order_detail_df.show()
# sales_order_detail_df.printSchema()

+---------+----------+-----------+
|OrderYear|OrderMonth|HighestSale|
+---------+----------+-----------+
|     2011|         5|   12149.96|
|     2011|         6|    3578.27|
|     2011|         7|   27055.76|
|     2011|         8|   21101.79|
|     2011|         9|    3578.27|
|     2011|        10|   23190.65|
|     2011|        11|    3578.27|
|     2011|        12|   16319.95|
|     2012|         1|   20249.94|
|     2012|         2|   14279.96|
|     2012|         3|   23020.13|
|     2012|         4|    9944.66|
|     2012|         5|   16665.60|
|     2012|         6|   19149.76|
|     2012|         7|   16665.60|
|     2012|         8|   13976.49|
|     2012|         9|   14128.74|
|     2012|        10|   15306.13|
|     2012|        11|   12951.34|
|     2012|        12|   12811.78|
+---------+----------+-----------+
only showing top 20 rows

