In [2]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
print(os.getcwd())

from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (SparkSession.builder .appName("OptimizedLocalSpark") .getOrCreate())
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from datetime import datetime, timedelta

# Initialize Spark session
spark = SparkSession.builder.appName("FindSundays").getOrCreate()

# Sample data
data = [("2023-01-01", "2023-01-15"), ("2023-02-01", "2023-02-28"), ("2023-03-01", "2023-03-15")]
columns = ["StartDate", "EndDate"]

# Create DataFrame
df = spark.createDataFrame(data, columns)

df.show()





+----------+----------+
| StartDate|   EndDate|
+----------+----------+
|2023-01-01|2023-01-15|
|2023-02-01|2023-02-28|
|2023-03-01|2023-03-15|
+----------+----------+



In [4]:

# Define UDF to calculate number of Sundays
def count_sundays(start_date, end_date):
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    count = 0
    while start <= end:
        if start.weekday() == 6:  # Sunday is represented by 6
            count += 1
        start += timedelta(days=1)
    return count

count_sundays_udf = udf(count_sundays, IntegerType())

# Apply UDF to DataFrame
df = df.withColumn("NumberOfSundays", count_sundays_udf(col("StartDate"), col("EndDate")))
df.show()

+----------+----------+---------------+
| StartDate|   EndDate|NumberOfSundays|
+----------+----------+---------------+
|2023-01-01|2023-01-15|              3|
|2023-02-01|2023-02-28|              4|
|2023-03-01|2023-03-15|              2|
+----------+----------+---------------+



In [5]:
from pyspark.sql.functions import expr

# Add real-time data to DataFrame
df.createOrReplaceTempView("date_range")

# Use Spark SQL to calculate number of Sundays
result = spark.sql("""
    SELECT 
        StartDate, 
        EndDate,
        DATEDIFF(EndDate, StartDate) AS TotalDays,
        FLOOR((DATEDIFF(EndDate, StartDate) + 1) / 7) AS NumberOfSundays
    FROM date_range
""")

result.show()


+----------+----------+---------+---------------+
| StartDate|   EndDate|TotalDays|NumberOfSundays|
+----------+----------+---------+---------------+
|2023-01-01|2023-01-15|       14|              2|
|2023-02-01|2023-02-28|       27|              4|
|2023-03-01|2023-03-15|       14|              2|
+----------+----------+---------+---------------+



In [6]:
from pyspark.sql.functions import col, explode, sequence, lit, dayofweek

# Create DataFrame with date range
df = df.withColumn(
    "DateRange",
    sequence(col("StartDate").cast("date"), col("EndDate").cast("date"))
)

# Explode date range into individual dates
df = df.withColumn("IndividualDate", explode(col("DateRange")))

# Filter Sundays and count them
df_sundays = df.filter(dayofweek(col("IndividualDate")) == 1)  # Sunday is 1 in Spark SQL
result = df_sundays.groupBy("StartDate", "EndDate").count().withColumnRenamed("count", "NumberOfSundays")

result.show()


+----------+----------+---------------+
| StartDate|   EndDate|NumberOfSundays|
+----------+----------+---------------+
|2023-01-01|2023-01-15|              3|
|2023-02-01|2023-02-28|              4|
|2023-03-01|2023-03-15|              2|
+----------+----------+---------------+



In [7]:
from datetime import datetime, timedelta

def count_sundays(start_date, end_date):
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    count = 0
    while start <= end:
        if start.weekday() == 6:  # Sunday is represented by 6
            count += 1
        start += timedelta(days=1)
    return count

# Example usage
start_date = "2023-01-01"
end_date = "2023-01-15"
print("Number of Sundays:", count_sundays(start_date, end_date))


Number of Sundays: 3
