In [1]:
import os

# Switch the notebook's working directory to the project folder.
# The path is a raw string: "\p" is not a valid escape sequence, so the
# non-raw literal triggers a SyntaxWarning/DeprecationWarning on recent
# Python versions (the resulting path value is unchanged).
# NOTE(review): absolute, machine-specific path — consider a configurable
# project directory instead.
os.chdir(r"H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Spark session shared by the rest of the notebook.
# All tunables live in one mapping so they are easy to scan and adjust.
spark_settings = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    "spark.cores.max": "12",
    "spark.sql.shuffle.partitions": "28",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

session_builder = SparkSession.builder.appName("OptimizedLocalSpark")
for option, value in spark_settings.items():
    session_builder = session_builder.config(option, value)

# getOrCreate() returns the already-running session if there is one.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the Spark session (getOrCreate() reuses an active one if present)
spark = SparkSession.builder.appName("ConsecutiveWorkingDays").getOrCreate()

# Sample login dates per employee. For reference, 2024-10-01 is a Tuesday.
logins_by_employee = {
    1: [
        '2024-10-02', '2024-10-03', '2024-10-04',  # Wed, Thu, Fri -> 3 consecutive days
        '2024-10-07', '2024-10-08',                # Mon, Tue (after the weekend)
    ],
    2: [
        '2024-10-01',                              # Tue, isolated (no login on Wed)
        '2024-10-03', '2024-10-04',                # Thu, Fri
        '2024-10-07', '2024-10-08',                # Mon, Tue (after the weekend)
    ],
}

# Flatten to (employee_id, logindate) tuples, preserving the order above
data = [
    (employee, login_day)
    for employee, login_days in logins_by_employee.items()
    for login_day in login_days
]
columns = ["employee_id", "logindate"]

# Build the DataFrame and parse the string dates into DateType in one chain
df = (
    spark.createDataFrame(data, columns)
    .withColumn("logindate", F.to_date("logindate", "yyyy-MM-dd"))
)

# Expose the data to Spark SQL under the name `employee_logins`
df.createOrReplaceTempView("employee_logins")
df.show()


+-----------+----------+
|employee_id| logindate|
+-----------+----------+
|          1|2024-10-02|
|          1|2024-10-03|
|          1|2024-10-04|
|          1|2024-10-07|
|          1|2024-10-08|
|          2|2024-10-01|
|          2|2024-10-03|
|          2|2024-10-04|
|          2|2024-10-07|
|          2|2024-10-08|
+-----------+----------+



# Find Streaks of n or More Consecutive Login Days (n = 3 here), Ignoring Weekend Logins, Using Spark SQL

In [6]:
# Find streaks of consecutive login dates per employee with Spark SQL.
#
# How the inner query builds `streak_id`:
#   * Weekend rows are removed first (DAYOFWEEK: 1 = Sunday, 7 = Saturday);
#     the WHERE clause is applied before the window function is evaluated.
#   * ROW_NUMBER() increases by 1 per remaining login of an employee, and
#     DATEDIFF(logindate, '2024-01-01') increases by 1 per calendar day, so
#     their difference stays constant while dates are back-to-back and
#     changes whenever there is a gap. The anchor date '2024-01-01' is only
#     a fixed reference point for the day offset.
#   * Because the offset is in calendar days, a Friday followed by a Monday
#     (a 3-day jump) starts a new streak rather than continuing the old one.
#
# The outer query groups by (employee_id, streak_id), reports the first and
# last date and the number of logins in each streak, and keeps only streaks
# with at least 3 logins.
res_sql = spark.sql("""
SELECT employee_id, 
       MIN(logindate) AS start_date, 
       MAX(logindate) AS end_date, 
       COUNT(logindate) AS consecutive_days
FROM (
    SELECT employee_id,
           logindate,
           ROW_NUMBER() OVER (PARTITION BY employee_id ORDER BY logindate) -
           DATEDIFF(logindate, '2024-01-01') AS streak_id
    FROM employee_logins
    WHERE DAYOFWEEK(logindate) NOT IN (1, 7) -- Exclude weekends
) AS streak_table
GROUP BY employee_id, streak_id
HAVING COUNT(logindate) >= 3 -- Only include streaks with 3 or more consecutive days
ORDER BY employee_id, start_date
""")

# Display the qualifying streaks
res_sql.show()


+-----------+----------+----------+----------------+
|employee_id|start_date|  end_date|consecutive_days|
+-----------+----------+----------+----------------+
|          1|2024-10-02|2024-10-04|               3|
+-----------+----------+----------+----------------+



In [5]:
# Find streaks of consecutive login dates per employee with the DataFrame API.

# Window over each employee's logins in chronological order
window_spec = Window.partitionBy("employee_id").orderBy("logindate")

# Filter out weekends BEFORE computing lag/day_diff (Sunday=1, Saturday=7).
# If the lag were taken first, a Saturday/Sunday row that is later removed
# could still act as the "previous day": with logins on Fri, Sat, Sun, Mon the
# Monday row would get day_diff == 1 and be merged into Friday's streak.
# Filtering first matches the Spark SQL version, whose WHERE clause runs
# before its window function.
df = df.filter(F.dayofweek("logindate").isin([2, 3, 4, 5, 6]))

# Previous weekday login of the same employee and the calendar-day gap to it
# (both are null on each employee's first row)
df = df.withColumn("prev_logindate", F.lag("logindate").over(window_spec))
df = df.withColumn("day_diff", F.datediff("logindate", "prev_logindate"))

# Running count of "streak starts": a row starts a new streak unless it is
# exactly one day after the previous login (a null day_diff also starts one)
df = df.withColumn(
    "streak_id",
    F.sum(F.when(F.col("day_diff") == 1, 0).otherwise(1)).over(window_spec),
)

# One row per streak; keep only streaks of 3 or more consecutive days and
# order the result the same way as the Spark SQL version
df_streaks = (
    df.groupBy("employee_id", "streak_id")
    .agg(
        F.count("logindate").alias("consecutive_days"),
        F.min("logindate").alias("start_date"),
        F.max("logindate").alias("end_date"),
    )
    .filter("consecutive_days >= 3")
    .orderBy("employee_id", "start_date")
)

df_streaks.show()


+-----------+---------+----------------+----------+----------+
|employee_id|streak_id|consecutive_days|start_date|  end_date|
+-----------+---------+----------------+----------+----------+
|          1|        1|               3|2024-10-02|2024-10-04|
+-----------+---------+----------------+----------+----------+

