In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
import pandas as pd
import os
import sys

# Make the Spark driver and the Python workers use the same interpreter
# that is running this kernel.
os.environ.update(
    {
        'PYSPARK_PYTHON': sys.executable,
        'PYSPARK_DRIVER_PYTHON': sys.executable,
    }
)

In [2]:
import logging
import pyspark
from delta import *
import json

# Read the extra Spark options (a JSON object of key/value pairs) from disk.
with open('/usr/local/spark/conf/spark-defaults.json', 'r') as f:
    config = json.loads(f.read())

# Start a session builder named "MyApp1" with the Delta Lake SQL extension
# and the Delta catalog registered as the default `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Read the packages from the text file
# packages = []
# with open('/usr/local/spark/conf/packages.txt', 'r') as file:
#     # Read each line and strip newlines or extra spaces
#     packages = [line.strip() for line in file if line.strip()]

# # Add packages to the Spark session configuration
# builder.config("spark.jars.packages", ",".join(packages))

# Layer every option read from the JSON file on top of the builder.
# `config()` hands back the builder itself, so rebinding keeps the same object.
for option_name, option_value in config.items():
    builder = builder.config(option_name, option_value)

In [None]:

# Pass the builder through the delta-spark helper, then start the session
# (or reuse one that is already running).
delta_builder = configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

# Leave the session as the last expression so the notebook renders its summary.
spark

In [5]:
# Output locations used further down:
#   trgt_path_csv       - directory that receives the CSV export
#   trgt_path_processed - location of the Delta table that is created/merged
trgt_path_csv = "/mnt/Calendar/Calendar_Processed/"
trgt_path_processed = "/mnt/Calendar/Calendar_Parquet/"

In [6]:
# First and last day of the calendar, as yyyy-MM-dd strings
# (they are converted to dates when the sequence is built).
start_date, end_date = "2000-01-01", "2050-12-31"

In [7]:
# One-row frame that carries the two boundary strings into Spark.
date_range_df = spark.createDataFrame(
    [(start_date, end_date)],
    ["start_date", "end_date"],
)

# Parse both boundaries as dates and build a single array column "date"
# holding the sequence from the first boundary to the last.
range_start = to_date(col("start_date"))
range_end = to_date(col("end_date"))
date_sequence_df = date_range_df.select(
    sequence(range_start, range_end).alias("date")
)

In [8]:
# Unnest the array so that every element becomes its own row in column "date".
df_date = date_sequence_df.select(explode(col("date")).alias("date"))

In [9]:
# Derive the calendar attributes from "date" in a single projection.
df_output = df_date.select(
    "date",
    # Date with the "-" separators stripped out (a string surrogate key)
    regexp_replace("date", "-", "").alias("DateSK"),
    year("date").alias("Year"),
    # "MMMM" formats the month as its full name
    date_format("date", "MMMM").alias("Month"),
    # Year and quarter number joined as "<year> Q<n>"
    concat(year("date"), lit(" Q"), quarter("date")).alias("Quarter"),
)

In [None]:
# Sanity check: show how many calendar rows were generated.
calendar_row_count = df_output.count()
print(calendar_row_count)

In [11]:
# Expose the generated calendar to Spark SQL under the name "vw_source";
# the MERGE / CREATE TABLE statement built below selects from this view.
df_output.createOrReplaceTempView("vw_source")

In [None]:
# Persist the calendar at trgt_path_processed.
# - If a Delta table already exists there, upsert on DateSK: rows that match
#   are refreshed, and rows that are missing from the target are inserted.
#   The insert branch matters when start_date / end_date are widened after
#   the table was first created; without it the extra dates would never be
#   written.
# - Otherwise create the table from the full contents of vw_source.
if DeltaTable.isDeltaTable(spark, trgt_path_processed):
    column_name = df_output.columns
    set_clause = ", ".join([f"target.{i} = source.{i}" for i in column_name])
    query = (
        f"""MERGE INTO delta.`{trgt_path_processed}` AS target """
        f"""USING vw_source AS source ON target.DateSK = source.DateSK """
        f"""WHEN MATCHED THEN UPDATE SET {set_clause} """
        f"""WHEN NOT MATCHED THEN INSERT *"""
    )
else:
    query = f"""CREATE TABLE delta.`{trgt_path_processed}` USING DELTA AS SELECT * FROM vw_source"""

# Echo the statement before running it so the executed SQL shows in the output.
print(query)
spark.sql(query)

In [13]:
# Re-read the Delta table and export it as CSV with a header row.
calendar_delta_df = spark.read.format("delta").load(trgt_path_processed)

# coalesce(1) collapses the data to one partition before writing;
# "overwrite" replaces whatever is already at the target path.
csv_writer = (
    calendar_delta_df.coalesce(1)
    .write.format("csv")
    .option("header", "true")
    .mode("overwrite")
)
csv_writer.save(trgt_path_csv)

In [14]:
# Shut the Spark session down now that the export has been written.
spark.stop()