In [1]:
import os
# Switch the working directory to the project folder.
# Raw string: the Windows path contains a backslash, and "\p" is not a valid
# escape sequence, so a plain literal triggers an invalid-escape warning on
# newer Python versions. The resulting path is exactly the same.
os.chdir(r"H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Spark options applied to the session builder, in this order
spark_options = [
    ("spark.driver.memory", "8g"),
    ("spark.executor.memory", "8g"),
    ("spark.executor.cores", "4"),
    ("spark.cores.max", "12"),
    ("spark.sql.shuffle.partitions", "28"),
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
]

# Start from a named builder and attach every option before creating the session
session_builder = SparkSession.builder.appName("OptimizedLocalSpark")
for option_name, option_value in spark_options:
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

# Keep a handle on the underlying SparkContext as well
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


# New, Repeat, Active, Lapsed Customers

### New Customer: First-time buyers within a period.
### Repeat Customer: Customers who have placed more than one order.
### Active Customer: Customers with orders within a recent period (e.g., last 30 days).
### Lapsed Customer: Customers whose most recent order is older than a specified timeframe (e.g., no orders in the last 30 days).


In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, min, max, datediff, current_date

# Obtain the Spark session used by the analysis below
spark = SparkSession.builder.appName("CustomerAnalysis").getOrCreate()

# Sample orders, one tuple per order: (orderDate, OrderKey, customerID)
data = [
    ("2023-10-01", 1, "C001"),
    ("2023-10-10", 2, "C002"),
    ("2023-10-15", 3, "C001"),
    ("2023-10-20", 4, "C003"),
    ("2023-10-25", 5, "C001"),
    ("2023-10-30", 6, "C004"),
    ("2023-09-01", 7, "C002"),
    ("2023-08-15", 8, "C003"),
    ("2023-07-10", 9, "C001"),
    ("2023-06-05", 10, "C002")
]

# Column names, in the same order as the tuple fields above
columns = ["orderDate", "OrderKey", "customerID"]

# Build the DataFrame and cast the string dates to the date type in one chain
df = spark.createDataFrame(data, columns).withColumn(
    "orderDate", col("orderDate").cast("date")
)

# Register the orders for Spark SQL under the name "orders", then preview them
df.createOrReplaceTempView("orders")
df.show()


+----------+--------+----------+
| orderDate|OrderKey|customerID|
+----------+--------+----------+
|2023-10-01|       1|      C001|
|2023-10-10|       2|      C002|
|2023-10-15|       3|      C001|
|2023-10-20|       4|      C003|
|2023-10-25|       5|      C001|
|2023-10-30|       6|      C004|
|2023-09-01|       7|      C002|
|2023-08-15|       8|      C003|
|2023-07-10|       9|      C001|
|2023-06-05|      10|      C002|
+----------+--------+----------+



##### First, identify New and Repeat customers by counting the number of unique orders per customer

In [5]:
from pyspark.sql.window import Window
from pyspark.sql.functions import when, lit

from pyspark.sql import functions as F

# One row per customer: number of distinct orders and date of the first order.
# countDistinct (rather than count) keeps order_count correct even if the same
# OrderKey appears on more than one row; null keys are ignored either way.
# The F. prefix avoids depending on pyspark's min shadowing Python's builtin.
customer_orders = (
    df.groupBy("customerID")
    .agg(
        F.countDistinct("OrderKey").alias("order_count"),
        F.min("orderDate").alias("first_order_date"),
    )
    # Exactly one order -> "New"; any other count -> "Repeat"
    .withColumn(
        "customer_type",
        F.when(F.col("order_count") == 1, "New").otherwise("Repeat"),
    )
)


##### Define customers as Active or Lapsed based on recent activity. Let's assume a customer is considered active if they placed an order within the last 30 days.

In [6]:
# Latest order date for each customer
last_order_per_customer = df.groupBy("customerID").agg(
    max("orderDate").alias("last_order_date")
)

# Days elapsed between the latest order and today's date.
# Note: current_date() is evaluated at run time, so the labels below depend
# on the day the notebook is executed.
days_since_last_order = datediff(current_date(), col("last_order_date"))

# Active = last order at most 30 days ago; everything else is Lapsed
is_active = days_since_last_order <= 30
recent_orders = last_order_per_customer.withColumn(
    "customer_status", when(is_active, "Active").otherwise("Lapsed")
)

# Combine the New/Repeat labels with the Active/Lapsed labels per customer
final_df = customer_orders.join(recent_orders, "customerID", "inner")
final_df.show()


+----------+-----------+----------------+-------------+---------------+---------------+
|customerID|order_count|first_order_date|customer_type|last_order_date|customer_status|
+----------+-----------+----------------+-------------+---------------+---------------+
|      C001|          4|      2023-07-10|       Repeat|     2023-10-25|         Lapsed|
|      C002|          3|      2023-06-05|       Repeat|     2023-10-10|         Lapsed|
|      C003|          2|      2023-08-15|       Repeat|     2023-10-20|         Lapsed|
|      C004|          1|      2023-10-30|          New|     2023-10-30|         Lapsed|
+----------+-----------+----------------+-------------+---------------+---------------+



In [8]:
# Spark SQL classification of customers, run against the "orders" view.
#
# customer_orders CTE: for each customerID, computes the number of distinct
#   orders (order_count), the date of the first order (first_order_date) and
#   the date of the most recent order (last_order_date).
#   COUNT(DISTINCT OrderKey) is used so that an OrderKey appearing on more
#   than one row is still counted as a single order.
#
# Main query:
#   customer_type   - 'New' when the customer has exactly 1 order, else 'Repeat'.
#   customer_status - 'Active' when the most recent order is at most 30 days
#                     before current_date(), else 'Lapsed'.

res = spark.sql("""
WITH customer_orders AS (
    SELECT customerID,
           COUNT(DISTINCT OrderKey) AS order_count,
           MIN(orderDate) AS first_order_date,
           MAX(orderDate) AS last_order_date
    FROM orders
    GROUP BY customerID
)

SELECT customerID,
       CASE WHEN order_count = 1 THEN 'New' ELSE 'Repeat' END AS customer_type,
       CASE WHEN DATEDIFF(current_date(), last_order_date) <= 30 THEN 'Active' ELSE 'Lapsed' END AS customer_status
FROM customer_orders
""")

res.show()


+----------+-------------+---------------+
|customerID|customer_type|customer_status|
+----------+-------------+---------------+
|      C001|       Repeat|         Lapsed|
|      C002|       Repeat|         Lapsed|
|      C003|       Repeat|         Lapsed|
|      C004|          New|         Lapsed|
+----------+-------------+---------------+

