In [1]:
import json
import pandas as pd
import os
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# Start (or reuse) a Spark session running in local mode on this machine.
spark = (
    SparkSession.builder
    .appName("CustomerChurn")
    .master("local[*]")
    .getOrCreate()
)

# Read the raw event log; no schema is passed, so Spark infers it from the file.
data = spark.read.json("../data/customer churn data.json")

# The cells below operate on `df`, but only `data` was ever bound in this
# notebook, so a fresh "Restart Kernel -> Run All" stopped with a NameError.
# Bind `df` here and keep `data` as the untouched reference to the raw load.
# NOTE(review): assumes `df` was meant to be the unmodified raw load - confirm.
df = data

# Peek at schema and rows
data.printSchema()
data.show(5)
print(data.count(), "rows")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/12 14:27:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/12 14:27:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

+--------------------+---------+---------+------+-------------+--------+----------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+-------+
|              artist|     auth|firstName|gender|itemInSession|lastName|    length|



26259199 rows


                                                                                

In [None]:
# 1) Users with at least one "Cancellation Confirmation" event are the churned users.
churn_users = (
    df.filter(F.col("page") == "Cancellation Confirmation")
      .select("userId")
      .distinct()
)

# 2) Attach churn_flag to every event row: 1 for churned users, 0 for everyone else.
#    Any churn_flag left over from an earlier run of this cell is dropped first
#    (drop is a no-op when the column is absent). Without that, re-running the cell
#    joins a second churn_flag column onto the frame and later references to
#    "churn_flag" become ambiguous.
df = (
    df.drop("churn_flag")
      .join(churn_users.withColumn("churn_flag", F.lit(1)), on="userId", how="left")
      .fillna({"churn_flag": 0})
)

# 3) Number of distinct users per churn_flag value.
df.select("userId", "churn_flag").distinct().groupBy("churn_flag").count().show()



+----------+-----+
|churn_flag|count|
+----------+-----+
|         1| 5003|
|         0|17275|
+----------+-----+



                                                                                

In [7]:
# Display the first ten cancellation events (user, page, event timestamp) untruncated.
(
    df.where(F.col("page") == "Cancellation Confirmation")
      .select("userId", "page", "ts")
      .show(n=10, truncate=False)
)

+-------+-------------------------+-------------+
|userId |page                     |ts           |
+-------+-------------------------+-------------+
|1768454|Cancellation Confirmation|1538360145000|
|1381915|Cancellation Confirmation|1538362976000|
|1298443|Cancellation Confirmation|1538366008000|
|1379352|Cancellation Confirmation|1538371144000|
|1068112|Cancellation Confirmation|1538371567000|
|1362621|Cancellation Confirmation|1538372613000|
|1276848|Cancellation Confirmation|1538376242000|
|1277353|Cancellation Confirmation|1538376254000|
|1274097|Cancellation Confirmation|1538378849000|
|1163202|Cancellation Confirmation|1538379172000|
+-------+-------------------------+-------------+
only showing top 10 rows


In [None]:


# Largest event timestamp recorded for each user.
user_last_event = df.groupBy("userId").agg(F.max("ts").alias("last_ts"))

# Cancellation events whose timestamp is strictly earlier than the same user's
# latest timestamp, i.e. cancellations that were followed by further activity.
check_after_cancel = (
    df.join(user_last_event, on="userId")
      .where(F.col("page") == "Cancellation Confirmation")
      .where(F.col("ts") < F.col("last_ts"))
)

# A result of 0 means no cancellation event precedes a later event from that user.
check_after_cancel.count()

                                                                                

0