In [1]:
import os
# Work from the project directory. The path is written as a raw string so the
# backslash is taken literally; in a normal string "\p" is an invalid escape
# sequence and triggers a warning on current Python versions.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
os.chdir(r"H:\pyspark_advanced-coding_interview")
print(os.getcwd())

from pyspark.sql import SparkSession

# Get (or create) the Spark session named "OptimizedLocalSpark" and keep a
# handle to its SparkContext. No extra configuration options are set here.
spark = SparkSession.builder.appName("OptimizedLocalSpark").getOrCreate()
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


## Start and end day of the week

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_add, date_format, dayofweek

# Get (or create) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("StartEndOfWeek")
    .getOrCreate()
)

# Three sample transaction dates, supplied as ISO-formatted strings
columns = ["TransactionDate"]
data = [("2024-11-15",), ("2024-11-10",), ("2024-11-13",)]
df = spark.createDataFrame(data, columns)
df.show()

# Week boundaries for a Sunday-to-Saturday week: dayofweek() numbers the days
# 1 (Sunday) through 7 (Saturday), so stepping back (DayOfWeek - 1) days reaches
# Sunday and stepping forward (7 - DayOfWeek) days reaches Saturday.
txn_date = col("TransactionDate")
day_number = col("DayOfWeek")
days_since_week_start = day_number - 1
days_until_week_end = 7 - day_number

df = df.withColumn("TransactionDate", txn_date.cast("date"))
df = df.withColumn("DayOfWeek", dayofweek(txn_date))
df = df.withColumn("StartOfWeek", date_add(txn_date, -days_since_week_start))
df = df.withColumn("EndOfWeek", date_add(txn_date, days_until_week_end))

df.show()


+---------------+
|TransactionDate|
+---------------+
|     2024-11-15|
|     2024-11-10|
|     2024-11-13|
+---------------+

+---------------+---------+-----------+----------+
|TransactionDate|DayOfWeek|StartOfWeek| EndOfWeek|
+---------------+---------+-----------+----------+
|     2024-11-15|        6| 2024-11-10|2024-11-16|
|     2024-11-10|        1| 2024-11-10|2024-11-16|
|     2024-11-13|        4| 2024-11-10|2024-11-16|
+---------------+---------+-----------+----------+



In [3]:
# Expose the DataFrame to Spark SQL under the view name "transactions"
df.createOrReplaceTempView("transactions")

# Week boundaries computed in SQL: step back to the first day of the week
# (DAYOFWEEK - 1 days) and forward to the last day (7 - DAYOFWEEK days).
week_bounds_query = """
    SELECT 
        TransactionDate,
        DATE_SUB(TransactionDate, DAYOFWEEK(TransactionDate) - 1) AS StartOfWeek,
        DATE_ADD(TransactionDate, 7 - DAYOFWEEK(TransactionDate)) AS EndOfWeek
    FROM transactions
"""
result = spark.sql(week_bounds_query)

result.show()


+---------------+-----------+----------+
|TransactionDate|StartOfWeek| EndOfWeek|
+---------------+-----------+----------+
|     2024-11-15| 2024-11-10|2024-11-16|
|     2024-11-10| 2024-11-10|2024-11-16|
|     2024-11-13| 2024-11-10|2024-11-16|
+---------------+-----------+----------+



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from datetime import timedelta

# Get (or create) the Spark session used by this example
spark = (
    SparkSession.builder
    .appName("StartEndOfWeek")
    .getOrCreate()
)

# Three sample transaction dates, supplied as ISO-formatted strings
columns = ["TransactionDate"]
data = [("2024-11-15",), ("2024-11-10",), ("2024-11-13",)]

# Build the frame and convert the string column to a date column in one chain,
# so the Python helpers applied to it later receive date values.
df = (
    spark.createDataFrame(data, columns)
    .withColumn("TransactionDate", col("TransactionDate").cast("date"))
)

# UDFs to calculate start and end of the week
def get_start_of_week(date_obj):
    """Return the Monday of the week that contains ``date_obj``.

    Args:
        date_obj: A ``datetime.date`` value, or ``None`` for a null cell.

    Returns:
        The date of that week's Monday, or ``None`` when ``date_obj`` is ``None``.
    """
    # Null dates (e.g. strings the cast to date could not parse) arrive as
    # None; pass them through instead of raising inside the UDF.
    if date_obj is None:
        return None
    # weekday() is 0 for Monday, so subtracting it lands on that week's Monday.
    return date_obj - timedelta(days=date_obj.weekday())

def get_end_of_week(date_obj):
    """Return the Sunday of the week that contains ``date_obj``.

    Args:
        date_obj: A ``datetime.date`` value, or ``None`` for a null cell.

    Returns:
        The date of that week's Sunday, or ``None`` when ``date_obj`` is ``None``.
    """
    # Null dates (e.g. strings the cast to date could not parse) arrive as
    # None; pass them through instead of raising inside the UDF.
    if date_obj is None:
        return None
    # weekday() is 6 for Sunday, so (6 - weekday) is the distance to Sunday.
    return date_obj + timedelta(days=6 - date_obj.weekday())

# Wrap the two Python helpers as Spark UDFs whose results are DateType values
end_udf = udf(get_end_of_week, DateType())
start_udf = udf(get_start_of_week, DateType())

# Append the week-boundary columns computed from TransactionDate
df = (
    df
    .withColumn("StartOfWeek", start_udf(col("TransactionDate")))
    .withColumn("EndOfWeek", end_udf(col("TransactionDate")))
)

df.show()



+---------------+-----------+----------+
|TransactionDate|StartOfWeek| EndOfWeek|
+---------------+-----------+----------+
|     2024-11-15| 2024-11-11|2024-11-17|
|     2024-11-10| 2024-11-04|2024-11-10|
|     2024-11-13| 2024-11-11|2024-11-17|
+---------------+-----------+----------+

