In [0]:
# Input CSV loaded by this notebook: /FileStore/tables/Sales_SalesOrderDetail-4.csv

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, DateType
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Obtain the active Spark session, or start one under this application name.
spark = (
    SparkSession.builder
    .appName("AzureFunctionDateProcessing")
    .getOrCreate()
)

# Column layout of the sales-order-detail CSV as (name, Spark type) pairs,
# listed in file order. Every column is declared nullable below.
_column_types = [
    ("SalesOrderID", IntegerType()),
    ("SalesOrderDetailID", IntegerType()),
    ("CarrierTrackingNumber", StringType()),
    ("OrderQty", IntegerType()),
    ("ProductID", IntegerType()),
    ("SpecialOfferID", IntegerType()),
    ("UnitPrice", DecimalType(10, 2)),
    ("UnitPriceDiscount", DecimalType(10, 2)),
    ("LineTotal", DecimalType(20, 2)),
    ("rowguid", StringType()),
    ("ModifiedDate", DateType()),
]
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _column_types]
)

# Read the CSV with the explicit schema. The 'header' option is 'false', so
# the first line of the file is treated as a data row.
csv_reader = spark.read.format('csv').option('header', 'false').schema(schema)
df = csv_reader.load("/FileStore/tables/Sales_SalesOrderDetail-4.csv")

# Print the first 10 rows as a quick check that the load worked.
df.show(10)

from pyspark.sql.functions import col, from_utc_timestamp, to_utc_timestamp

# Derive two timestamp columns from ModifiedDate and append them to df:
#   ModifiedDate_IST - from_utc_timestamp(ModifiedDate, "Asia/Kolkata")
#   ModifiedDate_UTC - to_utc_timestamp(ModifiedDate, "UTC")
modified_date = col("ModifiedDate")
df_with_timezones = df.select(
    "*",
    from_utc_timestamp(modified_date, "Asia/Kolkata").alias("ModifiedDate_IST"),
    to_utc_timestamp(modified_date, "UTC").alias("ModifiedDate_UTC"),
)

# Display the distinct key / date combinations without truncating cell values.
timezone_preview = df_with_timezones.select(
    "SalesOrderID",
    "SalesOrderDetailID",
    "ModifiedDate",
    "ModifiedDate_UTC",
    "ModifiedDate_IST",
).distinct()
timezone_preview.show(truncate=False)



# Aggregate per ModifiedDate: count the non-null SalesOrderID values for each
# date, then attach the IST / UTC timestamp columns derived from that date.
#
# The timezone columns are recomputed here because groupBy keeps only the
# grouping key and the aggregate; the columns on df_with_timezones do not
# survive the aggregation.
#
# NOTE: no partially commented-out code may be left in this cell. An indented
# `.withColumn(...)` continuation whose first line is commented out makes
# Python raise IndentationError before anything in the cell runs.
df_grouped = (
    df_with_timezones
    .groupBy("ModifiedDate")
    # Alias the aggregate directly instead of renaming the generated
    # "count(SalesOrderID)" column after the fact.
    .agg(F.count("SalesOrderID").alias("OrderCount"))
    .withColumn("ModifiedDate_IST", F.from_utc_timestamp("ModifiedDate", "Asia/Kolkata"))
    .withColumn("ModifiedDate_UTC", F.to_utc_timestamp("ModifiedDate", "UTC"))
)

# Show the per-date counts alongside both timestamp columns, untruncated.
df_grouped.select("ModifiedDate", "ModifiedDate_UTC", "ModifiedDate_IST", "OrderCount").show(truncate=False)








+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+------------+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|LineTotal|             rowguid|ModifiedDate|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+------------+
|       43659|                 1|         4911-403C-98|       1|      776|             1|  2024.99|             0.00|  2024.99|B207C96D-D9E6-402...|  2011-05-31|
|       43659|                 2|         4911-403C-98|       3|      777|             1|  2024.99|             0.00|  6074.98|7ABB600D-1E77-41B...|  2011-05-31|
|       43659|                 3|         4911-403C-98|       1|      778|             1|  2024.99|             0.00|  2024.99|475CF8C6-49F6-486...|  2011-05-31|
|       43659|              