In [0]:

# Input CSV loaded by the cells below: /FileStore/tables/Sales_SalesOrderDetail-2.csv


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, DateType
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("AzureFunctionDateProcessing").getOrCreate()

# Explicit schema for the header-less CSV: every column is nullable, and the
# column order below must match the order of the fields in the file.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("SalesOrderID", IntegerType()),
        ("SalesOrderDetailID", IntegerType()),
        ("CarrierTrackingNumber", StringType()),
        ("OrderQty", IntegerType()),
        ("ProductID", IntegerType()),
        ("SpecialOfferID", IntegerType()),
        ("UnitPrice", DecimalType(10, 2)),
        ("UnitPriceDiscount", DecimalType(10, 2)),
        ("LineTotal", DecimalType(20, 2)),
        ("rowguid", StringType()),
        ("ModifiedDate", DateType()),
    )
])

# Read the CSV using the schema above instead of inferring types; the file
# has no header row, so the first line is treated as data.
df = (
    spark.read
    .format('csv')
    .option('header', 'false')
    .schema(schema)
    .load("/FileStore/tables/Sales_SalesOrderDetail-2.csv")
)

# Quick sanity check: print the first 10 rows.
df.show(10)






+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+------------+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|LineTotal|             rowguid|ModifiedDate|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+------------+
|       43659|                 1|         4911-403C-98|       1|      776|             1|  2024.99|             0.00|  2024.99|B207C96D-D9E6-402...|  2011-05-31|
|       43659|                 2|         4911-403C-98|       3|      777|             1|  2024.99|             0.00|  6074.98|7ABB600D-1E77-41B...|  2011-05-31|
|       43659|                 3|         4911-403C-98|       1|      778|             1|  2024.99|             0.00|  2024.99|475CF8C6-49F6-486...|  2011-05-31|
|       43659|              

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, DateType
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("SalesDataProcessing").getOrCreate()

# Explicit schema for the header-less CSV; all columns are nullable.
schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("SalesOrderDetailID", IntegerType(), True),
    StructField("CarrierTrackingNumber", StringType(), True),
    StructField("OrderQty", IntegerType(), True),
    StructField("ProductID", IntegerType(), True),
    StructField("SpecialOfferID", IntegerType(), True),
    StructField("UnitPrice", DecimalType(10, 2), True),
    StructField("UnitPriceDiscount", DecimalType(10, 2), True),
    StructField("LineTotal", DecimalType(20, 2), True),
    StructField("rowguid", StringType(), True),
    StructField("ModifiedDate", DateType(), True)
])

# Load the data with the schema (the file has no header row).
df = (
    spark.read
    .format('csv')
    .option('header', 'false')
    .schema(schema)
    .load("/FileStore/tables/Sales_SalesOrderDetail-2.csv")
)

# Show the first few rows to verify the data
df.show(5)

# Register the DataFrame as a SQL temporary view so it can be queried by name.
df.createOrReplaceTempView("SalesData")

# Collect the distinct order years to the driver.
# Rows with a NULL ModifiedDate are excluded: YEAR(NULL) is NULL, which would
# otherwise be interpolated into the per-year query and table name as "None".
years_df = spark.sql(
    "SELECT DISTINCT YEAR(ModifiedDate) AS OrderYear "
    "FROM SalesData "
    "WHERE ModifiedDate IS NOT NULL"
)
years = years_df.collect()
# Sorted so the tables are written in a deterministic order on every run.
years_list = sorted(row["OrderYear"] for row in years)

for y in years_list:
    # Select this year's rows and add two derived columns.
    # PreviousYearDate uses ADD_MONTHS(..., -12) rather than subtracting a
    # fixed 365 days, so the result is the same calendar day one year earlier
    # even when the interval spans a leap day (Feb 29 maps to Feb 28).
    # `y` is an integer produced by Spark's YEAR(), not user input.
    query = f"""
    SELECT *,
           YEAR(ModifiedDate) AS OrderYear,
           ADD_MONTHS(ModifiedDate, -12) AS PreviousYearDate
    FROM SalesData
    WHERE YEAR(ModifiedDate) = {y}
    """

    df_year = spark.sql(query)

    # Prints the DataFrame's repr (column names/types), not its rows.
    print(df_year)
    # Save the results to a Delta table, replacing any previous contents.
    df_year.write.mode("overwrite").format("delta").saveAsTable(f"default.sales_data2_{y}")

+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+------------+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|LineTotal|             rowguid|ModifiedDate|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+------------+
|       43659|                 1|         4911-403C-98|       1|      776|             1|  2024.99|             0.00|  2024.99|B207C96D-D9E6-402...|  2011-05-31|
|       43659|                 2|         4911-403C-98|       3|      777|             1|  2024.99|             0.00|  6074.98|7ABB600D-1E77-41B...|  2011-05-31|
|       43659|                 3|         4911-403C-98|       1|      778|             1|  2024.99|             0.00|  2024.99|475CF8C6-49F6-486...|  2011-05-31|
|       43659|              

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_utc_timestamp, from_utc_timestamp

# Obtain (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("TimezoneConversion").getOrCreate()

# NOTE: this cell does not load any data itself; it transforms the `df`
# left in scope by the previous cell, so that cell must be run first.
df = (
    df
    # Promote the date column to a timestamp (shown as midnight in the output).
    .withColumn("ModifiedDate", col("ModifiedDate").cast("timestamp"))
    # Treat the timestamp as a wall-clock time in the "UTC" zone and express
    # it in UTC; with "UTC" as the zone the value comes out unchanged, as the
    # identical ModifiedDate / ModifiedDate_UTC columns in the output show.
    .withColumn("ModifiedDate_UTC", to_utc_timestamp(col("ModifiedDate"), "UTC"))
    # Shift the UTC value to Indian Standard Time (Asia/Kolkata, +05:30).
    .withColumn("ModifiedDate_IST", from_utc_timestamp(col("ModifiedDate_UTC"), "Asia/Kolkata"))
)

# Display the original and converted columns side by side.
df.select("ModifiedDate", "ModifiedDate_UTC", "ModifiedDate_IST").show(6)


+-------------------+-------------------+-------------------+
|       ModifiedDate|   ModifiedDate_UTC|   ModifiedDate_IST|
+-------------------+-------------------+-------------------+
|2011-05-31 00:00:00|2011-05-31 00:00:00|2011-05-31 05:30:00|
|2011-05-31 00:00:00|2011-05-31 00:00:00|2011-05-31 05:30:00|
|2011-05-31 00:00:00|2011-05-31 00:00:00|2011-05-31 05:30:00|
|2011-05-31 00:00:00|2011-05-31 00:00:00|2011-05-31 05:30:00|
|2011-05-31 00:00:00|2011-05-31 00:00:00|2011-05-31 05:30:00|
|2011-05-31 00:00:00|2011-05-31 00:00:00|2011-05-31 05:30:00|
+-------------------+-------------------+-------------------+
only showing top 6 rows

