In [0]:
## /FileStore/tables/Sales_SalesOrderDetail-5.csv

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_utc_timestamp, from_utc_timestamp, lit, dayofyear, when

In [0]:
# Initialize the Spark session
spark = SparkSession.builder.appName("Timezone Conversion").getOrCreate()

In [0]:
# Schema for CSV data
schema = """
SalesOrderID INT,
SalesOrderDetailID INT,
CarrierTrackingNumber STRING,
OrderQty INT,
ProductID INT,
SpecialOfferID INT,
UnitPrice DOUBLE,
UnitPriceDiscount DOUBLE,
LineTotal DOUBLE,
rowguid STRING,
ModifiedDate TIMESTAMP
"""

# Load the data with the schema
df = spark.read.format('csv').option('header', 'false').schema(schema).load("/FileStore/tables/Sales_SalesOrderDetail-5.csv")

In [0]:
df_with_timezone = df.withColumn("UTC", to_utc_timestamp(col("ModifiedDate"), "UTC"))

# Static list of European time zones with daylight saving details
time_zone_table = [
    {"zone_id": "Europe/London", "local_time_with_dst": "BST", "local_time_without_dst": "GMT"},
    {"zone_id": "Europe/Berlin", "local_time_with_dst": "CEST", "local_time_without_dst": "CET"},
    {"zone_id": "Europe/Paris", "local_time_with_dst": "CEST", "local_time_without_dst": "CET"},
    {"zone_id": "Europe/Madrid", "local_time_with_dst": "CEST", "local_time_without_dst": "CET"},
    {"zone_id": "Europe/Rome", "local_time_with_dst": "CEST", "local_time_without_dst": "CET"}
]

# Define the conversion logic and calculate local time for different zones
df_with_timezone = df_with_timezone \
    .withColumn("Local_Time_Without_DST", from_utc_timestamp(col("ModifiedDate"), "Europe/London")) \
    .withColumn("Local_Time_With_DST", from_utc_timestamp(col("ModifiedDate"), "Europe/Berlin"))

# Determine if daylight saving is on or off (assuming DST between day 60 and day 300 of the year for Europe)
df_with_timezone = df_with_timezone \
    .withColumn("Day_Of_Year", dayofyear(col("ModifiedDate"))) \
    .withColumn("Is_DST_On", when((col("Day_Of_Year") >= 60) & (col("Day_Of_Year") <= 300), lit("Yes")).otherwise(lit("No")))

# Show the final DataFrame
df_with_timezone.show(truncate=False)

+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+------------------------------------+-------------------+-------------------+----------------------+-------------------+-----------+---------+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|LineTotal|rowguid                             |ModifiedDate       |UTC                |Local_Time_Without_DST|Local_Time_With_DST|Day_Of_Year|Is_DST_On|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+------------------------------------+-------------------+-------------------+----------------------+-------------------+-----------+---------+
|43659       |1                 |4911-403C-98         |1       |776      |1             |2024.994 |0.0              |2024.994 |B207C96D-D9E6-402B-8470-2CC176C42283|2011-05-31 00:00:00|2011-0

In [0]:
df_with_timezone.show(10)

+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+-------------------+-------------------+----------------------+-------------------+-----------+---------+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|LineTotal|             rowguid|       ModifiedDate|                UTC|Local_Time_Without_DST|Local_Time_With_DST|Day_Of_Year|Is_DST_On|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+-------------------+-------------------+----------------------+-------------------+-----------+---------+
|       43659|                 1|         4911-403C-98|       1|      776|             1| 2024.994|              0.0| 2024.994|B207C96D-D9E6-402...|2011-05-31 00:00:00|2011-05-31 00:00:00|   2011-05-31 01:00:00|2011-05-31 02:00:00|       

In [0]:
#df_with_timeZone.show("Local_Time_Without_DST","Local_Time_With_DST","Day_Of_Year","Is_DST_On")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month

# Initialize the Spark session
spark = SparkSession.builder.appName("Sales Data Filter by Year and Month").getOrCreate()

# Schema for the CSV data
schema = """
SalesOrderID INT,
SalesOrderDetailID INT,
CarrierTrackingNumber STRING,
OrderQty INT,
ProductID INT,
SpecialOfferID INT,
UnitPrice DOUBLE,
UnitPriceDiscount DOUBLE,
LineTotal DOUBLE,
rowguid STRING,
ModifiedDate TIMESTAMP
"""

# Load the CSV data (use the correct path for your dataset)
csv_file_path = "/FileStore/tables/Sales_SalesOrderDetail-5.csv"
df = spark.read.format("csv").option("header", "true").schema(schema).load(csv_file_path)

# Convert 'ModifiedDate' to timestamp if it's not already in the correct format
df = df.withColumn("ModifiedDate", col("ModifiedDate").cast("timestamp"))

# Extract year and month from 'ModifiedDate'
df = df.withColumn("Year", year(col("ModifiedDate"))).withColumn("Month", month(col("ModifiedDate")))

# Show the dataframe with Year and Month columns
df.show()

# Get distinct years and months in the data
distinct_years_months = df.select("Year", "Month").distinct().collect()

# Loop through the distinct years and months in the data
for row in distinct_years_months:
    year_val = row["Year"]
    month_val = row["Month"]
    
    # Filter data for the specific year and month
    df_filtered = df.filter((col("Year") == year_val) & (col("Month") == month_val))
    
    # Show the filtered data for each month
    print(f"Data for {year_val}-{month_val}")
    df_filtered.show()
    
    # Save each month's data to a new CSV (path can be adjusted)
    output_path = f"/mnt/data/sales_data_{year_val}_{month_val}.csv"
    df_filtered.write.csv(output_path, header=True, mode="overwrite")

+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+-------------------+----+-----+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|LineTotal|             rowguid|       ModifiedDate|Year|Month|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+---------+--------------------+-------------------+----+-----+
|       43659|                 2|         4911-403C-98|       3|      777|             1| 2024.994|              0.0| 6074.982|7ABB600D-1E77-41B...|2011-05-31 00:00:00|2011|    5|
|       43659|                 3|         4911-403C-98|       1|      778|             1| 2024.994|              0.0| 2024.994|475CF8C6-49F6-486...|2011-05-31 00:00:00|2011|    5|
|       43659|                 4|         4911-403C-98|       1|      771|             1| 2039.994| 