In [1]:
import os
# Switch the working directory to the project folder.
# A raw string is used because "\p" is not a recognised escape sequence:
# a plain literal only works by accident and triggers an invalid-escape
# warning on current Python versions. The resulting path is unchanged.
# NOTE(review): absolute, machine-specific path — consider a configurable
# project-root variable if this notebook is shared.
os.chdir(r"H:\pyspark_advanced-coding_interview")

# Last expression of the cell, so the notebook displays the new working directory.
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# Generate missing date records

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DateType
from pyspark.sql.functions import col

# Start (or reuse) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("MissingDateRecords")
    .getOrCreate()
)

# Sample orders as (orderID, customerID, productID, orderdate) tuples.
# The dates run from 2024-10-01 to 2024-10-10, but 2024-10-02 and 2024-10-05
# have no order at all — those are the gaps the later cells fill in.
data = [
    (101, 1, 1001, "2024-10-01"),
    (102, 1, 1002, "2024-10-03"),
    (103, 2, 1003, "2024-10-04"),
    (104, 2, 1004, "2024-10-06"),
    # New Records
    (105, 3, 1005, "2024-10-08"),
    (106, 3, 1006, "2024-10-09"),
    (107, 4, 1007, "2024-10-03"),
    (108, 4, 1008, "2024-10-10"),
    (109, 5, 1009, "2024-10-04"),
    (110, 6, 1010, "2024-10-06"),
    (111, 1, 1011, "2024-10-01"),
    (114, 4, 1014, "2024-10-07"),
]

# Build the DataFrame and convert the string dates to DateType in one chain
df = (
    spark.createDataFrame(data, ["orderID", "customerID", "productID", "orderdate"])
    .withColumn("orderdate", col("orderdate").cast(DateType()))
)

# Register the frame as the "orders" view so Spark SQL queries can reference it
df.createOrReplaceTempView("orders")

# Mark the frame for caching (it is reused by the cells below), then print it
df.cache().show()


+-------+----------+---------+----------+
|orderID|customerID|productID| orderdate|
+-------+----------+---------+----------+
|    101|         1|     1001|2024-10-01|
|    102|         1|     1002|2024-10-03|
|    103|         2|     1003|2024-10-04|
|    104|         2|     1004|2024-10-06|
|    105|         3|     1005|2024-10-08|
|    106|         3|     1006|2024-10-09|
|    107|         4|     1007|2024-10-03|
|    108|         4|     1008|2024-10-10|
|    109|         5|     1009|2024-10-04|
|    110|         6|     1010|2024-10-06|
|    111|         1|     1011|2024-10-01|
|    114|         4|     1014|2024-10-07|
+-------+----------+---------+----------+



In [7]:
# Fill the gaps in the order calendar with Spark SQL.
#   bounds     - first and last order date found in the `orders` view, so the
#                calendar follows the data instead of hard-coded date literals
#   date_range - one row per calendar day between those bounds (inclusive)
#   full_data  - every calendar day LEFT JOINed to its orders; a day without
#                any order keeps NULL in orderID / customerID / productID
# The statement carries no trailing ';': spark.sql() takes a single statement,
# so the terminator is unnecessary and is not accepted by every Spark version.
res = spark.sql("""
    WITH bounds AS (
        SELECT min(orderdate) AS start_date,
               max(orderdate) AS end_date
        FROM orders
    ),
    date_range AS (
        SELECT explode(sequence(start_date, end_date, interval 1 day)) AS orderdate
        FROM bounds
    ),
    full_data AS (
        SELECT dr.orderdate, o.orderID, o.customerID, o.productID
        FROM date_range dr
        LEFT JOIN orders o ON dr.orderdate = o.orderdate
    )
    SELECT * FROM full_data ORDER BY orderdate
""")

res.show()

+----------+-------+----------+---------+
| orderdate|orderID|customerID|productID|
+----------+-------+----------+---------+
|2024-10-01|    111|         1|     1011|
|2024-10-01|    101|         1|     1001|
|2024-10-02|   null|      null|     null|
|2024-10-03|    107|         4|     1007|
|2024-10-03|    102|         1|     1002|
|2024-10-04|    109|         5|     1009|
|2024-10-04|    103|         2|     1003|
|2024-10-05|   null|      null|     null|
|2024-10-06|    110|         6|     1010|
|2024-10-06|    104|         2|     1004|
|2024-10-07|    114|         4|     1014|
|2024-10-08|    105|         3|     1005|
|2024-10-09|    106|         3|     1006|
|2024-10-10|    108|         4|     1008|
+----------+-------+----------+---------+



In [8]:
# Same gap-filling as the SQL version, written with the DataFrame API.

# Find the first and last order date (a single-row frame) so the calendar
# follows the data instead of hard-coded date literals.
date_bounds_df = df.agg(
    F.min("orderdate").alias("start_date"),
    F.max("orderdate").alias("end_date"),
)

# Generate date range DataFrame: one row per day between the bounds, inclusive
date_range_df = date_bounds_df.select(
    F.explode(
        F.sequence(F.col("start_date"), F.col("end_date"), F.expr("interval 1 day"))
    ).alias("orderdate")
)

# Join original data with date range DataFrame. The calendar is the left side
# of the left join, so days without any order survive with null order columns.
full_data_df = date_range_df.join(df, "orderdate", "left")

# Show the result
full_data_df.orderBy("orderdate").show()


+----------+-------+----------+---------+
| orderdate|orderID|customerID|productID|
+----------+-------+----------+---------+
|2024-10-01|    111|         1|     1011|
|2024-10-01|    101|         1|     1001|
|2024-10-02|   null|      null|     null|
|2024-10-03|    107|         4|     1007|
|2024-10-03|    102|         1|     1002|
|2024-10-04|    109|         5|     1009|
|2024-10-04|    103|         2|     1003|
|2024-10-05|   null|      null|     null|
|2024-10-06|    110|         6|     1010|
|2024-10-06|    104|         2|     1004|
|2024-10-07|    114|         4|     1014|
|2024-10-08|    105|         3|     1005|
|2024-10-09|    106|         3|     1006|
|2024-10-10|    108|         4|     1008|
+----------+-------+----------+---------+

