In [0]:
from pyspark.sql import SparkSession

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DecimalType, DateType, DoubleType


In [0]:
spark  = SparkSession.builder.appName("sql_func").getOrCreate()

In [0]:
# Define the schema for the DataFrame
schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),          # Assuming integer for order ID
    StructField("SalesOrderDetailID", IntegerType(), True),    # Assuming integer for detail ID
    StructField("CarrierTrackingNumber", StringType(), True),  # String type for tracking numbers
    StructField("OrderQty", IntegerType(), True),              # Integer type for order quantity
    StructField("ProductID", IntegerType(), True),             # Integer type for product ID
    StructField("SpecialOfferID", IntegerType(), True),        # Integer type for special offer ID
    StructField("UnitPrice", DecimalType(10, 2), True),        # Decimal type for unit price, precision 10, scale 2
    StructField("UnitPriceDiscount", DecimalType(10, 2), True),# Decimal type for unit price discount, precision 10, scale 2
    StructField("LineTotal", DecimalType(15, 2), True),        # Decimal type for line total, precision 15, scale 2
    StructField("rowguid", StringType(), True),                # String type for GUID, typically stored as text
    StructField("ModifiedDate", DateType(), True)              # Date type for modified date
])


In [0]:
# Define the schema for the DataFrame with ModifiedDate as DateType
schema2 = StructType([
    StructField("SalesOrderID", IntegerType(), True),            # Integer for Sales Order ID
    StructField("RevisionNumber", IntegerType(), True),          # Integer for Revision Number
    StructField("OrderDate", StringType(), True),                # String for Order Date (could be DateType if dates)
    StructField("DueDate", StringType(), True),                  # String for Due Date (could be DateType)
    StructField("ShipDate", StringType(), True),                 # String for Ship Date (could be DateType)
    StructField("Status", IntegerType(), True),                  # Integer for Status
    StructField("OnlineOrderFlag", IntegerType(), True),         # Integer for Online Order Flag
    StructField("SalesOrderNumber", StringType(), True),         # String for Sales Order Number
    StructField("PurchaseOrderNumber", StringType(), True),      # String for Purchase Order Number
    StructField("AccountNumber", StringType(), True),            # String for Account Number
    StructField("CustomerID", IntegerType(), True),              # Integer for Customer ID
    StructField("SalesPersonID", StringType(), True),            # String for Sales Person ID (could be integer if numeric)
    StructField("TerritoryID", IntegerType(), True),             # Integer for Territory ID
    StructField("BillToAddressID", IntegerType(), True),         # Integer for Bill To Address ID
    StructField("ShipToAddressID", IntegerType(), True),         # Integer for Ship To Address ID
    StructField("ShipMethodID", IntegerType(), True),            # Integer for Ship Method ID
    StructField("CreditCardID", StringType(), True),             # String for Credit Card ID (could be integer if numeric)
    StructField("CreditCardApprovalCode", StringType(), True),   # String for Credit Card Approval Code
    StructField("CurrencyRateID", StringType(), True),           # String for Currency Rate ID (could be integer if numeric)
    StructField("SubTotal", DoubleType(), True),                 # Double for SubTotal
    StructField("TaxAmt", DoubleType(), True),                   # Double for Tax Amount
    StructField("Freight", DoubleType(), True),                  # Double for Freight
    StructField("TotalDue", DoubleType(), True),                 # Double for Total Due
    StructField("Comment", StringType(), True),                  # String for Comment
    StructField("rowguid", StringType(), True),                  # String for GUID
    StructField("ModifiedDate", DateType(), True)                # DateType for Modified Date
])


In [0]:
sales = spark.read.format('csv').option('header','False').schema(schema).load("dbfs:/FileStore/Sales_SalesOrderDetail.csv") 

In [0]:
order_head = spark.read.format("csv").option('header','true').schema(schema2).load('dbfs:/FileStore/Book1.csv')

In [0]:
sales.display()

SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,UnitPriceDiscount,LineTotal,rowguid,ModifiedDate
43659,1,4911-403C-98,1,776,1,2024.99,0.0,2024.99,B207C96D-D9E6-402B-8470-2CC176C42283,2011-05-31
43659,2,4911-403C-98,3,777,1,2024.99,0.0,6074.98,7ABB600D-1E77-41BE-9FE5-B9142CFC08FA,2011-05-31
43659,3,4911-403C-98,1,778,1,2024.99,0.0,2024.99,475CF8C6-49F6-486E-B0AD-AFC6A50CDD2F,2011-05-31
43659,4,4911-403C-98,1,771,1,2039.99,0.0,2039.99,04C4DE91-5815-45D6-8670-F462719FBCE3,2011-05-31
43659,5,4911-403C-98,1,772,1,2039.99,0.0,2039.99,5A74C7D2-E641-438E-A7AC-37BF23280301,2011-05-31
43659,6,4911-403C-98,2,773,1,2039.99,0.0,4079.99,CE472532-A4C0-45BA-816E-EEFD3FD848B3,2011-05-31
43659,7,4911-403C-98,1,774,1,2039.99,0.0,2039.99,80667840-F962-4EE3-96E0-AECA108E0D4F,2011-05-31
43659,8,4911-403C-98,3,714,1,28.84,0.0,86.52,E9D54907-E7B7-4969-80D9-76BA69F8A836,2011-05-31
43659,9,4911-403C-98,1,716,1,28.84,0.0,28.84,AA542630-BDCD-4CE5-89A0-C1BF82747725,2011-05-31
43659,10,4911-403C-98,6,709,1,5.7,0.0,34.2,AC769034-3C2F-495C-A5A7-3B71CDB25D4E,2011-05-31


In [0]:
sales.createOrReplaceTempView('Order_detail')
order_head.createOrReplaceTempView('Header_detail')

In [0]:
%sql
WITH DailySales AS (
SELECT
SalesOrderID,
ModifiedDate,
SUM(LineTotal) AS TotalSales
FROM
Order_detail SOD
GROUP BY
SOD.SalesOrderID, SOD.ModifiedDate
), 
rsales as (
select *, dense_rank()over(PARTITION BY ModifiedDate order by TotalSales desc) as rk FROM DailySales
ORDER BY SalesOrderID, rk 
)
select * FROM rsales
where rk = 1 


SalesOrderID,ModifiedDate,TotalSales,rk
43683,2011-05-31,42813.43,1
43702,2011-06-01,3578.27,1
43703,2011-06-01,3578.27,1
43706,2011-06-02,3578.27,1
43707,2011-06-02,3578.27,1
43709,2011-06-02,3578.27,1
43710,2011-06-02,3578.27,1
43711,2011-06-03,3578.27,1
43712,2011-06-03,3578.27,1
43713,2011-06-04,3578.27,1


In [0]:
from pyspark.sql.functions import sum, rank
from pyspark.sql.window import Window

# Assuming salesOrderDetailDF is already loaded and salesOrderHeaderDF is available
# Joining SalesOrderDetail with SalesOrderHeader to get the ModifiedDate and StoreID
# Calculate the total sales per store per day
daily_sales_df = sales.groupBy(
'SalesOrderID', 'ModifiedDate'
).agg(
sum('LineTotal').alias('TotalSales')
)
# Define a window partitioned by StoreID and ModifiedDate and ordered by TotalSales descending
window_spec = Window.partitionBy('SalesOrderID',
'ModifiedDate').orderBy(daily_sales_df['TotalSales'].desc())
# Apply the rank function
ranked_sales_df = daily_sales_df.withColumn('SalesRank', rank().over(window_spec))
# Filter to get only the top-ranked sales per store each day
highest_sales_df = ranked_sales_df.filter(ranked_sales_df['SalesRank'] == 1)
# Show the result
highest_sales_df.orderBy('SalesOrderID', 'ModifiedDate').display()

SalesOrderID,ModifiedDate,TotalSales,SalesRank
43659,2011-05-31,20565.6,1
43660,2011-05-31,1294.25,1
43661,2011-05-31,32726.48,1
43662,2011-05-31,28832.53,1
43663,2011-05-31,419.46,1
43664,2011-05-31,24432.6,1
43665,2011-05-31,14352.76,1
43666,2011-05-31,5056.49,1
43667,2011-05-31,6107.07,1
43668,2011-05-31,35944.15,1


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, rank
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DecimalType, DateType, DoubleType

spark  = SparkSession.builder.appName("sql_func").getOrCreate()


# Define the schema for the DataFrame
schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),          # Assuming integer for order ID
    StructField("SalesOrderDetailID", IntegerType(), True),    # Assuming integer for detail ID
    StructField("CarrierTrackingNumber", StringType(), True),  # String type for tracking numbers
    StructField("OrderQty", IntegerType(), True),              # Integer type for order quantity
    StructField("ProductID", IntegerType(), True),             # Integer type for product ID
    StructField("SpecialOfferID", IntegerType(), True),        # Integer type for special offer ID
    StructField("UnitPrice", DecimalType(10, 2), True),        # Decimal type for unit price, precision 10, scale 2
    StructField("UnitPriceDiscount", DecimalType(10, 2), True),# Decimal type for unit price discount, precision 10, scale 2
    StructField("LineTotal", DecimalType(15, 2), True),        # Decimal type for line total, precision 15, scale 2
    StructField("rowguid", StringType(), True),                # String type for GUID, typically stored as text
    StructField("ModifiedDate", DateType(), True)              # Date type for modified date
])

sales = spark.read.format('csv').option('header','False').schema(schema).load("dbfs:/FileStore/Sales_SalesOrderDetail.csv") 

sales.createOrReplaceTempView('Order_detail')

%sql
WITH DailySales AS (
SELECT
SalesOrderID,
ModifiedDate,
SUM(LineTotal) AS TotalSales
FROM
Order_detail SOD
GROUP BY
SOD.SalesOrderID, SOD.ModifiedDate
), 
rsales as (
select *, dense_rank()over(PARTITION BY ModifiedDate order by TotalSales desc) as rk FROM DailySales
ORDER BY SalesOrderID, rk 
)
select * FROM rsales
where rk = 1 


# Assuming salesOrderDetailDF is already loaded and salesOrderHeaderDF is available
# Joining SalesOrderDetail with SalesOrderHeader to get the ModifiedDate and StoreID
# Calculate the total sales per store per day
daily_sales_df = sales.groupBy(
'SalesOrderID', 'ModifiedDate'
).agg(
sum('LineTotal').alias('TotalSales')
)
# Define a window partitioned by StoreID and ModifiedDate and ordered by TotalSales descending
window_spec = Window.partitionBy('SalesOrderID',
'ModifiedDate').orderBy(daily_sales_df['TotalSales'].desc())
# Apply the rank function
ranked_sales_df = daily_sales_df.withColumn('SalesRank', rank().over(window_spec))
# Filter to get only the top-ranked sales per store each day
highest_sales_df = ranked_sales_df.filter(ranked_sales_df['SalesRank'] == 1)
# Show the result
highest_sales_df.orderBy('SalesOrderID', 'ModifiedDate').display()