In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
print(os.getcwd())

from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (SparkSession.builder .appName("OptimizedLocalSpark") .getOrCreate())
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


In [1]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("CustomerOrders").getOrCreate()

# Sample data
data = [
    (1, "2024-11-01"), (1, "2024-11-02"), (1, "2024-11-03"), (1, "2024-11-04"),
    (2, "2024-11-01"), (2, "2024-11-02"), (2, "2024-11-04"), (2, "2024-11-05"),
    (3, "2024-11-01"), (3, "2024-11-02"), (3, "2024-11-03"), (3, "2024-11-04"),
]
columns = ["CustomerID", "OrderDate"]

df = spark.createDataFrame(data, columns)
df.show()




+----------+----------+
|CustomerID| OrderDate|
+----------+----------+
|         1|2024-11-01|
|         1|2024-11-02|
|         1|2024-11-03|
|         1|2024-11-04|
|         2|2024-11-01|
|         2|2024-11-02|
|         2|2024-11-04|
|         2|2024-11-05|
|         3|2024-11-01|
|         3|2024-11-02|
|         3|2024-11-03|
|         3|2024-11-04|
+----------+----------+



In [2]:
# Register the DataFrame as a SQL view
df.createOrReplaceTempView("orders")

# SQL Query
result_sql = spark.sql("""
    WITH DailyOrders AS (
        SELECT 
            CustomerID, 
            COUNT(DISTINCT OrderDate) AS OrderDays
        FROM orders
        WHERE MONTH(OrderDate) = 11 AND YEAR(OrderDate) = 2024
        GROUP BY CustomerID
    ),
    TotalDays AS (
        SELECT COUNT(DISTINCT OrderDate) AS TotalDays
        FROM orders
        WHERE MONTH(OrderDate) = 11 AND YEAR(OrderDate) = 2024
    )
    SELECT 
        d.CustomerID
    FROM DailyOrders d
    CROSS JOIN TotalDays t
    WHERE d.OrderDays = t.TotalDays
""")

result_sql.show()

+----------+
|CustomerID|
+----------+
+----------+



In [3]:
from pyspark.sql.functions import col, countDistinct, lit

# Extract month and year for filtering
filtered_df = df.filter((col("OrderDate").substr(1, 7) == lit("2024-11")))

# Count unique dates for each customer
customer_days = filtered_df.groupBy("CustomerID").agg(countDistinct("OrderDate").alias("OrderDays"))

# Count total unique days in the month
total_days = filtered_df.select(countDistinct("OrderDate").alias("TotalDays")).collect()[0]["TotalDays"]

# Filter customers who ordered on all days
result_df = customer_days.filter(col("OrderDays") == total_days)

result_df.show()


+----------+---------+
|CustomerID|OrderDays|
+----------+---------+
+----------+---------+



In [4]:
# Collect data
data = filtered_df.collect()

# Get unique days in the month
unique_days = set([row["OrderDate"] for row in data])

# Group orders by CustomerID
from collections import defaultdict

customer_orders = defaultdict(set)
for row in data:
    customer_orders[row["CustomerID"]].add(row["OrderDate"])

# Find customers who ordered every day
customers_everyday = [
    customer for customer, days in customer_orders.items()
    if days == unique_days
]

print(customers_everyday)


[]


In [5]:
from pyspark.sql.window import Window
from pyspark.sql.functions import countDistinct

# Define a window over CustomerID
windowSpec = Window.partitionBy("CustomerID")

# Count distinct days for each customer
df_with_days = filtered_df.withColumn("OrderDays", countDistinct("OrderDate").over(windowSpec))

# Count total days in the month
total_days = filtered_df.select(countDistinct("OrderDate").alias("TotalDays")).collect()[0]["TotalDays"]

# Filter customers
result_df = df_with_days.filter(col("OrderDays") == total_days).select("CustomerID").distinct()

result_df.show()


AnalysisException: Distinct window functions are not supported: count(distinct OrderDate#1) windowspecdefinition(CustomerID#0L, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())).;
Project [CustomerID#0L, OrderDate#1, OrderDays#71L]
+- Project [CustomerID#0L, OrderDate#1, OrderDays#71L, OrderDays#71L]
   +- Window [count(distinct OrderDate#1) windowspecdefinition(CustomerID#0L, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS OrderDays#71L], [CustomerID#0L]
      +- Project [CustomerID#0L, OrderDate#1]
         +- Filter (substring(OrderDate#1, 1, 7) = 2024-11)
            +- LogicalRDD [CustomerID#0L, OrderDate#1], false
