In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import datetime

# Initialize Spark Session
spark = SparkSession.builder.appName("BiweeklyFridays").getOrCreate()

# Define schema using StructType and StructField
schema = StructType([
    StructField("RecordID", IntegerType(), True),
    StructField("TaskName", StringType(), True),
    StructField("StartDate", DateType(), True),
    StructField("EndDate", DateType(), True)
])

# Sample data (RecordID, TaskName, StartDate, EndDate)
data = [
    (1, "Task A", datetime.strptime("2024-01-01", "%Y-%m-%d").date(), datetime.strptime("2024-01-15", "%Y-%m-%d").date()),
    (2, "Task B", datetime.strptime("2024-02-03", "%Y-%m-%d").date(), datetime.strptime("2024-02-17", "%Y-%m-%d").date()),
    (3, "Task C", datetime.strptime("2024-01-10", "%Y-%m-%d").date(), datetime.strptime("2024-01-25", "%Y-%m-%d").date()),
    (4, "Task D", datetime.strptime("2024-03-05", "%Y-%m-%d").date(), datetime.strptime("2024-03-19", "%Y-%m-%d").date()),
    (5, "Task E", datetime.strptime("2024-03-21", "%Y-%m-%d").date(), datetime.strptime("2024-04-04", "%Y-%m-%d").date()),
    (6, "Task F", datetime.strptime("2024-04-08", "%Y-%m-%d").date(), datetime.strptime("2024-04-22", "%Y-%m-%d").date()),
    (7, "Task G", datetime.strptime("2024-05-12", "%Y-%m-%d").date(), datetime.strptime("2024-05-26", "%Y-%m-%d").date()),
    (8, "Task H", datetime.strptime("2024-06-02", "%Y-%m-%d").date(), datetime.strptime("2024-06-16", "%Y-%m-%d").date()),
    (9, "Task I", datetime.strptime("2024-07-01", "%Y-%m-%d").date(), datetime.strptime("2024-07-15", "%Y-%m-%d").date()),
    (10, "Task J", datetime.strptime("2024-07-20", "%Y-%m-%d").date(), datetime.strptime("2024-08-03", "%Y-%m-%d").date()),
    (11, "Task K", datetime.strptime("2024-08-09", "%Y-%m-%d").date(), datetime.strptime("2024-08-23", "%Y-%m-%d").date()),
    (12, "Task L", datetime.strptime("2024-09-01", "%Y-%m-%d").date(), datetime.strptime("2024-09-15", "%Y-%m-%d").date()),
    (13, "Task M", datetime.strptime("2024-10-01", "%Y-%m-%d").date(), datetime.strptime("2024-10-15", "%Y-%m-%d").date()),
    (14, "Task N", datetime.strptime("2024-10-20", "%Y-%m-%d").date(), datetime.strptime("2024-11-03", "%Y-%m-%d").date()),
    (15, "Task O", datetime.strptime("2024-12-05", "%Y-%m-%d").date(), datetime.strptime("2024-12-19", "%Y-%m-%d").date())
]

# Create DataFrame
df = spark.createDataFrame(data, schema)
df.cache()
df.createOrReplaceTempView("Tasks")
df.show()


+--------+--------+----------+----------+
|RecordID|TaskName| StartDate|   EndDate|
+--------+--------+----------+----------+
|       1|  Task A|2024-01-01|2024-01-15|
|       2|  Task B|2024-02-03|2024-02-17|
|       3|  Task C|2024-01-10|2024-01-25|
|       4|  Task D|2024-03-05|2024-03-19|
|       5|  Task E|2024-03-21|2024-04-04|
|       6|  Task F|2024-04-08|2024-04-22|
|       7|  Task G|2024-05-12|2024-05-26|
|       8|  Task H|2024-06-02|2024-06-16|
|       9|  Task I|2024-07-01|2024-07-15|
|      10|  Task J|2024-07-20|2024-08-03|
|      11|  Task K|2024-08-09|2024-08-23|
|      12|  Task L|2024-09-01|2024-09-15|
|      13|  Task M|2024-10-01|2024-10-15|
|      14|  Task N|2024-10-20|2024-11-03|
|      15|  Task O|2024-12-05|2024-12-19|
+--------+--------+----------+----------+



# Spark SQL

In [9]:
# calculate Biweekly Friday dates in an Year
# Create DataFrame for the first Friday of the year
first_friday = spark.sql("SELECT to_date('2024-01-05') AS BiweeklyFriday")

# Generate Biweekly Fridays Using a Loop in Spark SQL
query = """
SELECT explode(array(
    to_date('2024-01-05'),
    date_add('2024-01-05', 14 * 1),
    date_add('2024-01-05', 14 * 2),
    date_add('2024-01-05', 14 * 3),
    date_add('2024-01-05', 14 * 4),
    date_add('2024-01-05', 14 * 5),
    date_add('2024-01-05', 14 * 6),
    date_add('2024-01-05', 14 * 7),
    date_add('2024-01-05', 14 * 8),
    date_add('2024-01-05', 14 * 9),
    date_add('2024-01-05', 14 * 10),
    date_add('2024-01-05', 14 * 11),
    date_add('2024-01-05', 14 * 12),
    date_add('2024-01-05', 14 * 13),
    date_add('2024-01-05', 14 * 14),
    date_add('2024-01-05', 14 * 15),
    date_add('2024-01-05', 14 * 16),
    date_add('2024-01-05', 14 * 17),
    date_add('2024-01-05', 14 * 18),
    date_add('2024-01-05', 14 * 19),
    date_add('2024-01-05', 14 * 20),
    date_add('2024-01-05', 14 * 21),
    date_add('2024-01-05', 14 * 22),
    date_add('2024-01-05', 14 * 23),
    date_add('2024-01-05', 14 * 24),
    date_add('2024-01-05', 14 * 25)
)) AS BiweeklyFriday
"""

biweekly_fridays_from_loop = spark.sql(query)
biweekly_fridays_from_loop.show()


+--------------+
|BiweeklyFriday|
+--------------+
|    2024-01-05|
|    2024-01-19|
|    2024-02-02|
|    2024-02-16|
|    2024-03-01|
|    2024-03-15|
|    2024-03-29|
|    2024-04-12|
|    2024-04-26|
|    2024-05-10|
|    2024-05-24|
|    2024-06-07|
|    2024-06-21|
|    2024-07-05|
|    2024-07-19|
|    2024-08-02|
|    2024-08-16|
|    2024-08-30|
|    2024-09-13|
|    2024-09-27|
+--------------+
only showing top 20 rows



In [6]:
# Register the DataFrame as a temporary SQL view
df.createOrReplaceTempView("Tasks")

# SQL Query to Generate Biweekly Fridays Starting from First Friday in 2024
query = """
WITH BiweeklyFridays AS (
    SELECT date_add('2024-01-05', (row_number() OVER (ORDER BY (SELECT NULL)) - 1) * 14) AS BiweeklyFriday
    FROM Tasks
    LIMIT 26
)
SELECT BiweeklyFriday FROM BiweeklyFridays
"""

# Execute the query
biweekly_fridays_sql = spark.sql(query)
biweekly_fridays_sql.show()


+--------------+
|BiweeklyFriday|
+--------------+
|    2024-01-05|
|    2024-01-19|
|    2024-02-02|
|    2024-02-16|
|    2024-03-01|
|    2024-03-15|
|    2024-03-29|
|    2024-04-12|
|    2024-04-26|
|    2024-05-10|
|    2024-05-24|
|    2024-06-07|
|    2024-06-21|
|    2024-07-05|
|    2024-07-19|
+--------------+



# Pyspark

In [7]:
from pyspark.sql.functions import expr, sequence, explode

# Step 1: Generate a sequence of biweekly Fridays
start_date = datetime.strptime("2024-01-05", "%Y-%m-%d").date()
end_date = datetime.strptime("2024-12-27", "%Y-%m-%d").date()

# Step 2: Create a DataFrame using the sequence
biweekly_fridays_df = spark.sql(f"SELECT explode(sequence(to_date('{start_date}'), to_date('{end_date}'), interval 14 days)) AS BiweeklyFriday")

# Show the result
biweekly_fridays_df.show()


+--------------+
|BiweeklyFriday|
+--------------+
|    2024-01-05|
|    2024-01-19|
|    2024-02-02|
|    2024-02-16|
|    2024-03-01|
|    2024-03-15|
|    2024-03-29|
|    2024-04-12|
|    2024-04-26|
|    2024-05-10|
|    2024-05-24|
|    2024-06-07|
|    2024-06-21|
|    2024-07-05|
|    2024-07-19|
|    2024-08-02|
|    2024-08-16|
|    2024-08-30|
|    2024-09-13|
|    2024-09-27|
+--------------+
only showing top 20 rows



In [13]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

# Create a base DataFrame with one starting date
start_date_df = spark.createDataFrame([(datetime.strptime("2024-01-05", "%Y-%m-%d").date(),)], ["StartDate"])

# Generate a sequence of row numbers
dates_with_index = start_date_df.withColumn("idx", row_number().over(Window.orderBy("StartDate")))

# Generate biweekly Fridays using a calculated column
biweekly_fridays_window = dates_with_index.withColumn(
    "BiweeklyFriday", expr("date_add(StartDate, (idx - 1) * 14)")
).select("BiweeklyFriday")

# Show the result
biweekly_fridays_window.show()



+--------------+
|BiweeklyFriday|
+--------------+
|    2024-01-05|
+--------------+

