In [0]:
## /FileStore/tables/Sales_SalesOrderDetail.csv
## /FileStore/tables/Sales_SalesOrderHeader.csv

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, rank
from pyspark.sql.window import Window

In [0]:

# Initialize Spark session
spark = SparkSession.builder.appName("HighestSalePerStoreDaily").getOrCreate()


In [0]:
# Load the data into a DataFrame (assuming a DataFrame named salesOrderDetailDF)
salesOrderDetailDF = spark.read.format("csv").options(header=True).load("/FileStore/tables/Sales_SalesOrderDetail.csv")
salesOrderDetailDF.show()

+-----+---+------------+---+---+---+--------+----+-----------+------------------------------------+-----------------------+
|43659| 11|4911-403C-98| 13|776| 15|2024.994|0.00|2024.994000|B207C96D-D9E6-402B-8470-2CC176C42283|2011-05-31 00:00:00.000|
+-----+---+------------+---+---+---+--------+----+-----------+------------------------------------+-----------------------+
|43659|  2|4911-403C-98|  3|777|  1|2024.994|0.00|6074.982000|                7ABB600D-1E77-41B...|   2011-05-31 00:00:...|
|43659|  3|4911-403C-98|  1|778|  1|2024.994|0.00|2024.994000|                475CF8C6-49F6-486...|   2011-05-31 00:00:...|
|43659|  4|4911-403C-98|  1|771|  1|2039.994|0.00|2039.994000|                04C4DE91-5815-45D...|   2011-05-31 00:00:...|
|43659|  5|4911-403C-98|  1|772|  1|2039.994|0.00|2039.994000|                5A74C7D2-E641-438...|   2011-05-31 00:00:...|
|43659|  6|4911-403C-98|  2|773|  1|2039.994|0.00|4079.988000|                CE472532-A4C0-45B...|   2011-05-31 00:00:...|
|43659| 

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType

schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("SalesOrderDetailID", IntegerType(), True),
    StructField("CarrierTrackingNumber", StringType(), True),
    StructField("OrderQty", IntegerType(), True),
    StructField("ProductID", IntegerType(), True),
    StructField("SpecialOfferID", IntegerType(), True),
    StructField("UnitPrice", DoubleType(), True),
    StructField("UnitPriceDiscount", DoubleType(), True),
    StructField("LineTotal", DoubleType(), True),
    StructField("rowguid", StringType(), True),
    #StructField("ModifiedDate", TimestampType(), True)
])

salesOrderDetailDF = spark.read.format("csv").schema(schema).load("/FileStore/tables/Sales_SalesOrderDetail.csv")
salesOrderDetailDF.show()


+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|LineTotal|             rowguid|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+
|       43659|                 1|         4911-403C-98|       1|      776|             1| 2024.994|              0.0| 2024.994|B207C96D-D9E6-402...|
|       43659|                 2|         4911-403C-98|       3|      777|             1| 2024.994|              0.0| 6074.982|7ABB600D-1E77-41B...|
|       43659|                 3|         4911-403C-98|       1|      778|             1| 2024.994|              0.0| 2024.994|475CF8C6-49F6-486...|
|       43659|                 4|         4911-403C-98|       1|      771|             1| 2039.994|       

In [0]:
# Load the SalesOrderHeader DataFrame
salesOrderHeaderDF = spark.read.format("csv").option("header", True).load("/FileStore/tables/Sales_SalesOrderHeader.csv")
salesOrderHeaderDF.show()


+-----+---+-----------------------+-----------------------+------------------------+---+---+-------+-------------+--------------+-----+---+---+-----+-----+---+-----+-------------+------+----------+---------+---------+----------+------+------------------------------------+-------------------------+
|43659|  8|2011-05-31 00:00:00.000|2011-06-12 00:00:00.000|2011-06-07 00:00:00.0004| 55|  0|SO43659|  PO522145787|10-4020-000676|29825|279|512|98513|98514|515|16281|105041Vi84182|NULL18|20565.6206|1971.5149| 616.0984|23153.2339|NULL23|79B65321-39CA-4115-9CBA-8FE0903E12E6|2011-06-07 00:00:00.00025|
+-----+---+-----------------------+-----------------------+------------------------+---+---+-------+-------------+--------------+-----+---+---+-----+-----+---+-----+-------------+------+----------+---------+---------+----------+------+------------------------------------+-------------------------+
|43660|  8|   2011-05-31 00:00:...|   2011-06-12 00:00:...|    2011-06-07 00:00:...|  5|  0|SO43660|PO1

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType

schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("RevisionNumber", IntegerType(), True),
    StructField("OrderDate", TimestampType(), True),
    StructField("DueDate", TimestampType(), True),
    StructField("ShipDate", TimestampType(), True),
    StructField("Status", IntegerType(), True),
    StructField("OnlineOrderFlag", IntegerType(), True),
    StructField("SalesOrderNumber", StringType(), True),
    StructField("PurchaseOrderNumber", StringType(), True),
    StructField("AccountNumber", StringType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("SalesPersonID", IntegerType(), True),
    StructField("TerritoryID", IntegerType(), True),
    StructField("BillToAddressID", IntegerType(), True),
    StructField("ShipToAddressID", IntegerType(), True),
    StructField("ShipMethodID", IntegerType(), True),
    StructField("CreditCardID", IntegerType(), True),
    StructField("CreditCardApprovalCode", StringType(), True),
    StructField("CurrencyRateID", IntegerType(), True),
    StructField("SubTotal", DoubleType(), True),
    StructField("TaxAmt", DoubleType(), True),
    StructField("Freight", DoubleType(), True),
    StructField("TotalDue", DoubleType(), True),
    StructField("Comment", StringType(), True),
    StructField("rowguid", StringType(), True),
    StructField("ModifiedDate", TimestampType(), True)
])

# Example of reading the CSV file with the schema
salesOrderHeaderDF = spark.read.format("csv").schema(schema).load("/FileStore/tables/Sales_SalesOrderHeader.csv")
salesOrderHeaderDF.show()


+------------+--------------+-------------------+-------------------+-------------------+------+---------------+----------------+-------------------+--------------+----------+-------------+-----------+---------------+---------------+------------+------------+----------------------+--------------+----------+---------+---------+----------+-------+--------------------+-------------------+
|SalesOrderID|RevisionNumber|          OrderDate|            DueDate|           ShipDate|Status|OnlineOrderFlag|SalesOrderNumber|PurchaseOrderNumber| AccountNumber|CustomerID|SalesPersonID|TerritoryID|BillToAddressID|ShipToAddressID|ShipMethodID|CreditCardID|CreditCardApprovalCode|CurrencyRateID|  SubTotal|   TaxAmt|  Freight|  TotalDue|Comment|             rowguid|       ModifiedDate|
+------------+--------------+-------------------+-------------------+-------------------+------+---------------+----------------+-------------------+--------------+----------+-------------+-----------+---------------+-----

In [0]:
# Assuming salesOrderDetailDF is already loaded and salesOrderHeaderDF is available
# Joining SalesOrderDetail with SalesOrderHeader to get the ModifiedDate and StoreID
sales_df = salesOrderDetailDF.join(salesOrderHeaderDF, "SalesOrderID")


In [0]:
# Calculate the total sales per store per day
daily_sales_df = sales_df.groupBy(
'SalesOrderID', 'ModifiedDate'
).agg(
sum('LineTotal').alias('TotalSales')
)



In [0]:
# Define a window partitioned by StoreID and ModifiedDate and ordered by TotalSales descending
window_spec = Window.partitionBy("SalesOrderID", "ModifiedDate").orderBy(daily_sales_df["TotalSales"].desc())


In [0]:
# Apply the rank function
ranked_sales_df = daily_sales_df.withColumn("SalesRank", rank().over(window_spec))


In [0]:
# Filter to get only the top-ranked sales per store each day
highest_sales_df = ranked_sales_df.filter(ranked_sales_df["SalesRank"] == 1)


In [0]:
# Show the result
highest_sales_df.orderBy("SalesOrderID", "ModifiedDate").show()


+------------+-------------------+------------------+---------+
|SalesOrderID|       ModifiedDate|        TotalSales|SalesRank|
+------------+-------------------+------------------+---------+
|       43659|2011-06-07 00:00:00|        20565.6206|        1|
|       43660|2011-06-07 00:00:00|         1294.2529|        1|
|       43661|2011-06-07 00:00:00|        32726.4786|        1|
|       43662|2011-06-07 00:00:00|        28832.5289|        1|
|       43663|2011-06-07 00:00:00|          419.4589|        1|
|       43664|2011-06-07 00:00:00|24432.608799999995|        1|
|       43665|2011-06-07 00:00:00|        14352.7713|        1|
|       43666|2011-06-07 00:00:00|         5056.4896|        1|
|       43667|2011-06-07 00:00:00| 6107.081999999999|        1|
|       43668|2011-06-07 00:00:00|35944.156200000005|        1|
|       43669|2011-06-07 00:00:00|          714.7043|        1|
|       43670|2011-06-07 00:00:00| 6122.081999999999|        1|
|       43671|2011-06-07 00:00:00|      

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, rank
from pyspark.sql.window import Window

# Initialize Spark session
spark = SparkSession.builder.appName("HighestSalePerStoreDaily").getOrCreate()
salesOrderDetailDF = spark.read.format("csv").options(header=True).load("/FileStore/tables/Sales_SalesOrderDetail.csv")
salesOrderDetailDF.show()

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType

schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("SalesOrderDetailID", IntegerType(), True),
    StructField("CarrierTrackingNumber", StringType(), True),
    StructField("OrderQty", IntegerType(), True),
    StructField("ProductID", IntegerType(), True),
    StructField("SpecialOfferID", IntegerType(), True),
    StructField("UnitPrice", DoubleType(), True),
    StructField("UnitPriceDiscount", DoubleType(), True),
    StructField("LineTotal", DoubleType(), True),
    StructField("rowguid", StringType(), True),
    #StructField("ModifiedDate", TimestampType(), True)
])

salesOrderDetailDF = spark.read.format("csv").schema(schema).load("/FileStore/tables/Sales_SalesOrderDetail.csv")
salesOrderDetailDF.show()

# Load the SalesOrderHeader DataFrame
salesOrderHeaderDF = spark.read.format("csv").option("header", True).load("/FileStore/tables/Sales_SalesOrderHeader.csv")
salesOrderHeaderDF.show()

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType

schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("RevisionNumber", IntegerType(), True),
    StructField("OrderDate", TimestampType(), True),
    StructField("DueDate", TimestampType(), True),
    StructField("ShipDate", TimestampType(), True),
    StructField("Status", IntegerType(), True),
    StructField("OnlineOrderFlag", IntegerType(), True),
    StructField("SalesOrderNumber", StringType(), True),
    StructField("PurchaseOrderNumber", StringType(), True),
    StructField("AccountNumber", StringType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("SalesPersonID", IntegerType(), True),
    StructField("TerritoryID", IntegerType(), True),
    StructField("BillToAddressID", IntegerType(), True),
    StructField("ShipToAddressID", IntegerType(), True),
    StructField("ShipMethodID", IntegerType(), True),
    StructField("CreditCardID", IntegerType(), True),
    StructField("CreditCardApprovalCode", StringType(), True),
    StructField("CurrencyRateID", IntegerType(), True),
    StructField("SubTotal", DoubleType(), True),
    StructField("TaxAmt", DoubleType(), True),
    StructField("Freight", DoubleType(), True),
    StructField("TotalDue", DoubleType(), True),
    StructField("Comment", StringType(), True),
    StructField("rowguid", StringType(), True),
    StructField("ModifiedDate", TimestampType(), True)
])

# Example of reading the CSV file with the schema
salesOrderHeaderDF = spark.read.format("csv").schema(schema).load("/FileStore/tables/Sales_SalesOrderHeader.csv")
salesOrderHeaderDF.show()
sales_df = salesOrderDetailDF.join(salesOrderHeaderDF, "SalesOrderID")

# Calculate the total sales per store per day
daily_sales_df = sales_df.groupBy(
    "SalesOrderID", "ModifiedDate"
).agg(
    sum("LineTotal").alias("TotalSales")
)

# Define a window partitioned by StoreID and ModifiedDate and ordered by TotalSales descending
window_spec = Window.partitionBy("SalesOrderID", "ModifiedDate").orderBy(daily_sales_df["TotalSales"].desc())

ranked_sales_df = daily_sales_df.withColumn("SalesRank", rank().over(window_spec))
highest_sales_df = ranked_sales_df.filter(ranked_sales_df["SalesRank"] == 1)
highest_sales_df.orderBy("SalesOrderID", "ModifiedDate").show()


+-----+---+------------+---+---+---+--------+----+-----------+------------------------------------+-----------------------+
|43659| 11|4911-403C-98| 13|776| 15|2024.994|0.00|2024.994000|B207C96D-D9E6-402B-8470-2CC176C42283|2011-05-31 00:00:00.000|
+-----+---+------------+---+---+---+--------+----+-----------+------------------------------------+-----------------------+
|43659|  2|4911-403C-98|  3|777|  1|2024.994|0.00|6074.982000|                7ABB600D-1E77-41B...|   2011-05-31 00:00:...|
|43659|  3|4911-403C-98|  1|778|  1|2024.994|0.00|2024.994000|                475CF8C6-49F6-486...|   2011-05-31 00:00:...|
|43659|  4|4911-403C-98|  1|771|  1|2039.994|0.00|2039.994000|                04C4DE91-5815-45D...|   2011-05-31 00:00:...|
|43659|  5|4911-403C-98|  1|772|  1|2039.994|0.00|2039.994000|                5A74C7D2-E641-438...|   2011-05-31 00:00:...|
|43659|  6|4911-403C-98|  2|773|  1|2039.994|0.00|4079.988000|                CE472532-A4C0-45B...|   2011-05-31 00:00:...|
|43659| 

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, DateType

# Define the schema
schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("SalesOrderDetailID", IntegerType(), True),
    StructField("CarrierTrackingNumber", StringType(), True),
    StructField("OrderQty", IntegerType(), True),
    StructField("ProductID", IntegerType(), True),
    StructField("SpecialOfferID", IntegerType(), True),
    StructField("UnitPrice", DecimalType(10, 2), True),
    StructField("UnitPriceDiscount", DecimalType(10, 2), True),
    StructField("LineTotal", DecimalType(20, 2), True),
    StructField("rowguid", StringType(), True),
    StructField("ModifiedDate", DateType(), True)
])

# Load the data with the schema
data = spark.read.csv("/FileStore/tables/Sales_SalesOrderDetail.csv", schema=schema, header=False)

# Show the DataFrame
data.show()

data.createOrReplaceTempView("sales_data")

query_data = """
SELECT SalesOrderID, SalesOrderDetailID, CarrierTrackingNumber, OrderQty, ProductID,
       SpecialOfferID, UnitPrice, UnitPriceDiscount, LineTotal, rowguid, ModifiedDate
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY ModifiedDate ORDER BY LineTotal DESC) as row_num
    FROM sales_data
) tmp
WHERE row_num = 1
ORDER BY ModifiedDate
"""

# Execute the SQL query
result_df_Data = spark.sql(query_data)

# Show the result
result_df_Data.show()

+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+------------+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|LineTotal|             rowguid|ModifiedDate|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+------------+
|       43659|                 1|         4911-403C-98|       1|      776|             1|  2024.99|             0.00|  2024.99|B207C96D-D9E6-402...|  2011-05-31|
|       43659|                 2|         4911-403C-98|       3|      777|             1|  2024.99|             0.00|  6074.98|7ABB600D-1E77-41B...|  2011-05-31|
|       43659|                 3|         4911-403C-98|       1|      778|             1|  2024.99|             0.00|  2024.99|475CF8C6-49F6-486...|  2011-05-31|
|       43659|              