In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master('local[*]') \
        .appName('Basics') \
        .getOrCreate()

print(spark.version)

3.5.4


In [2]:
df_2018 = spark.read.csv(f"C:/Users/gyalm/OneDrive/Desktop/churn_prediction/churn_prediction/data/week2_data_for_student_lab.csv.gz", inferSchema=True, header=True)
df_2018.show(5)
df_2018.createOrReplaceTempView("df_2018_view")

+----------------+-----------+--------------------+------+-------+--------+------+-----+---------+-----------+------+-------------+
|transaction_date|member_type|           member_id|gender|product|quantity|amount|  age|card_type|branch_name|region|category_name|
+----------------+-----------+--------------------+------+-------+--------+------+-----+---------+-----------+------+-------------+
|      2018-04-01|     member|ff96777d-cfda-11e...|FEMALE|   3267|      16|  5100|30-35|  REGULAR|    E_Store|     3|  FROZEN FOOD|
|      2018-04-09|     member|00d89024-cfdb-11e...|FEMALE|   6748|       1|  1100|55-60|  REGULAR|    A_Store|     1|          PET|
|      2018-04-01|     member|00858b3b-cfdb-11e...|FEMALE|    420|       4|  5300|55-60|  REGULAR|    A_Store|     1|          PET|
|      2018-04-13|     member|001035ed-cfdb-11e...|FEMALE|   3178|      16|  5600|45-50|  REGULAR|    A_Store|     1|      GADGETS|
|      2018-04-15|     member|ff5fafeb-cfda-11e...|FEMALE|   2727|      19| 

In [3]:
df_2018.columns

['transaction_date',
 'member_type',
 'member_id',
 'gender',
 'product',
 'quantity',
 'amount',
 'age',
 'card_type',
 'branch_name',
 'region',
 'category_name']

In [4]:
spark.sql('''SELECT COUNT(DISTINCT category_name) FROM df_2018_view''').show()

+-----------------------------+
|count(DISTINCT category_name)|
+-----------------------------+
|                           10|
+-----------------------------+



In [5]:
spark.sql('''SELECT DISTINCT category_name FROM df_2018_view ORDER BY category_name ASC''').show()

+-------------+
|category_name|
+-------------+
|  ACCESSORIES|
|   APPLIANCES|
|     CLEANERS|
|      CLOTHES|
|         FOOD|
|  FROZEN FOOD|
|      GADGETS|
|     HARDWARE|
|          PET|
|        SHOES|
+-------------+



In [6]:
spark.sql('''
WITH customers_in_april AS (
  SELECT DISTINCT member_id
  FROM df_2018_view
  WHERE transaction_date BETWEEN '2018-04-01' AND '2018-04-30'
), 
customers_in_april_joined_with_past_3_months_txn AS (
  SELECT
    a.member_id, 
    b.quantity, 
    b.transaction_date
  FROM
    customers_in_april AS a
  LEFT JOIN
    df_2018_view AS b
  ON
    a.member_id = b.member_id
  WHERE
    b.transaction_date BETWEEN '2018-01-01' AND '2018-03-31'
), 
aggregated_total_txns_in_april AS (
  SELECT
    member_id, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-01-31' THEN quantity ELSE 0 END) AS pm3_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-02-01' AND '2018-02-28' THEN quantity ELSE 0 END) AS pm2_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-03-01' AND '2018-03-31' THEN quantity ELSE 0 END) AS pm1_total_txn,
    '2018-04-01' AS PARTITION_MONTH
  FROM
    customers_in_april_joined_with_past_3_months_txn
  GROUP BY
    member_id
),
customers_in_may AS (
  SELECT DISTINCT member_id
  FROM df_2018_view
  WHERE transaction_date BETWEEN '2018-05-01' AND '2018-05-31'
), 
customers_in_may_joined_with_past_3_months_txn AS (
  SELECT
    a.member_id, 
    b.quantity, 
    b.transaction_date
  FROM
    customers_in_may AS a
  LEFT JOIN
    df_2018_view AS b
  ON
    a.member_id = b.member_id
  WHERE
    b.transaction_date BETWEEN '2018-02-01' AND '2018-04-30'
), 
aggregated_total_txns_in_may AS (
  SELECT
    member_id, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-02-01' AND '2018-02-28' THEN quantity ELSE 0 END) AS pm3_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-03-01' AND '2018-03-31' THEN quantity ELSE 0 END) AS pm2_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-04-01' AND '2018-04-30' THEN quantity ELSE 0 END) AS pm1_total_txn,
    '2018-05-01' AS PARTITION_MONTH
  FROM
    customers_in_may_joined_with_past_3_months_txn
  GROUP BY
    member_id
),
customers_in_june AS (
  SELECT DISTINCT member_id
  FROM df_2018_view
  WHERE transaction_date BETWEEN '2018-06-01' AND '2018-06-30'
), 
customers_in_june_joined_with_past_3_months_txn AS (
  SELECT
    a.member_id, 
    b.quantity, 
    b.transaction_date
  FROM
    customers_in_june AS a
  LEFT JOIN
    df_2018_view AS b
  ON
    a.member_id = b.member_id
  WHERE
    b.transaction_date BETWEEN '2018-03-01' AND '2018-05-31'
), 
aggregated_total_txns_in_june AS (
  SELECT
    member_id, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-03-01' AND '2018-03-31' THEN quantity ELSE 0 END) AS pm3_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-04-01' AND '2018-04-30' THEN quantity ELSE 0 END) AS pm2_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-05-01' AND '2018-05-31' THEN quantity ELSE 0 END) AS pm1_total_txn,
    '2018-06-01' AS PARTITION_MONTH
  FROM
    customers_in_june_joined_with_past_3_months_txn
  GROUP BY
    member_id
),
union_all_aggregated_txns AS (
  SELECT * FROM aggregated_total_txns_in_april
  UNION ALL
  SELECT * FROM aggregated_total_txns_in_may
  UNION ALL
  SELECT * FROM aggregated_total_txns_in_june
)
SELECT
    PARTITION_MONTH,
    COUNT(member_id) AS total_member_id
FROM
    union_all_aggregated_txns
GROUP BY
    PARTITION_MONTH
ORDER BY
    PARTITION_MONTH
''').show()

+---------------+---------------+
|PARTITION_MONTH|total_member_id|
+---------------+---------------+
|     2018-04-01|          59143|
|     2018-05-01|          84325|
|     2018-06-01|          89658|
+---------------+---------------+



In [8]:
spark.sql('''
WITH
customers_in_july AS (
SELECT
  DISTINCT member_id
FROM
  df_2018_view
WHERE
  transaction_date BETWEEN '2018-07-01' AND '2018-07-31'
)
,customers_in_july_joined_with_past_transactions AS (
SELECT
  a.member_id
  ,b.quantity
  ,b.transaction_date
FROM
  customers_in_july AS a
LEFT JOIN
  df_2018_view AS b
ON
  a.member_id = b.member_id
AND
  b.transaction_date BETWEEN '2018-01-01' AND '2018-06-30'
)
,aggregated_total_txns_per_month AS (
SELECT
   member_id
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-04-01' AND '2018-04-30' THEN quantity ELSE 0 END) AS pm3_total_txn
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-05-01' AND '2018-05-31' THEN quantity ELSE 0 END) AS pm2_total_txn
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-06-01' AND '2018-06-30' THEN quantity ELSE 0 END) AS pm1_total_txn
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-07-01' AND '2018-07-31' THEN quantity ELSE 0 END) AS pm0_total_txn
  ,'2018-07-01' AS PARTITION_MONTH
FROM
  customers_in_july_joined_with_past_transactions
GROUP BY
  member_id
)
,customers_with_target_definition AS (
SELECT
  *
  ,CASE
    WHEN pm1_total_txn = 0 AND pm2_total_txn = 0 AND pm3_total_txn = 0 THEN 1
    ELSE 0
    END AS target
FROM
  aggregated_total_txns_per_month
)
,customers_joined_with_features AS (
SELECT
  a.member_id
  ,b.amount
  ,b.category_name
  ,b.transaction_date
FROM
  customers_in_july AS a
LEFT JOIN
  df_2018_view AS b
ON
  a.member_id = b.member_id
AND
  b.transaction_date BETWEEN '2018-01-01' AND '2018-03-31'
)
,aggregated_categories AS (
SELECT
   member_id

  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('ACCESSORIES') THEN amount ELSE 0 END) AS pm_accessories
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('APPLIANCES') THEN amount ELSE 0 END) AS pm_appliances
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('CLEANERS') THEN amount ELSE 0 END) AS pm_cleaners

  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('CLOTHES') THEN amount ELSE 0 END) AS pm_clothes
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('FOOD') THEN amount ELSE 0 END) AS pm_food
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('FROZEN FOOD') THEN amount ELSE 0 END) AS pm_frozen_food

  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('GADGETS') THEN amount ELSE 0 END) AS pm_gadgets
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('HARDWARE') THEN amount ELSE 0 END) AS pm_hardware
  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('PET') THEN amount ELSE 0 END) AS pm_pet

  ,SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-03-31' AND category_name IN ('SHOES') THEN amount ELSE 0 END) AS pm_shoes

FROM
  customers_joined_with_features
GROUP BY
  member_id
)
,customers_with_target_joined_with_features AS (
SELECT
  a.member_id
  ,a.target
  ,b.pm_accessories
  ,b.pm_appliances
  ,b.pm_cleaners
  ,b.pm_clothes
  ,b.pm_food
  ,b.pm_frozen_food
  ,b.pm_gadgets
  ,b.pm_hardware
  ,b.pm_pet
  ,b.pm_shoes
FROM
  customers_with_target_definition AS a
LEFT JOIN
  aggregated_categories AS b
ON
  a.member_id = b.member_id
)

,count_customers_with_target_definition AS (
SELECT
  target
  ,COUNT(member_id) AS cnt_member_id
  ,COUNT(DISTINCT member_id) AS cntd_member_id
FROM
  customers_with_target_definition
GROUP BY
  target
)

SELECT
  *
FROM
  count_customers_with_target_definition
''').show()

+------+-------------+--------------+
|target|cnt_member_id|cntd_member_id|
+------+-------------+--------------+
|     1|        15873|         15873|
|     0|        34794|         34794|
+------+-------------+--------------+



In [9]:
(34794/(15832+34794))*100

68.72753130802354

Analysis of Week 3: Data Engineering

With the goal of identifying customers that churned based on past three months with no transactions, a comparison between the performance and behavorial window was postulated. The performance window is inclusive of the months of April to June with the target of viewing the net churn rate up until customers in July - thus, the target selection consists of the member_id, quantity, and transaction_date. The behavorial window subsist of January to March, including the feature selection of member_id, category_name, amount, and transaction_date, in order to balance the distribution of data.

Data selection begins with `customers_in_july`, filtering distinct `member_id` values who transacted in July 2018. Data cleaning occurs in `customers_in_july_joined_with_past_transactions`, where a `LEFT JOIN` links July customers with their past transactions from January to June, ensuring historical/trend continuity. Feature selection is implemented in `customers_joined_with_features`, extracting `member_id`, `amount`, `category_name`, and `transaction_date` from January to March for behavioral analysis. Class balancing is introduced in `customers_with_target_definition`, where churn (`target=1`) is assigned to customers with zero transactions in April, May, and June. Feature engineering occurs in `aggregated_categories`, summing transaction amounts per category for January–March, to enrich feature selection in the performance window profile. Data augmentation happens in `customers_with_target_joined_with_features`, merging churn labels with the behavorial window of expenditure.Lastly, data standardization is applied in `count_customers_with_target_definition`, aggregating customer counts per churn status.

To calculate the overall churn rate of 68.72%, counted customers of those that have had zero transactions aggregated for the past 3 months of the performance window (`target=0`) were divided by the sum of customers that have churned in the past 3 months (`target=1`) and customers that have had zero transactions in the past 3 months, multiplied by a 100 to get the percentage value. Overall, a churn rate of 68.72% indicates a significant portion of the customer base has become inactive over the past three months, suggesting potential issues in customer engagement, satisfaction, or retention strategies.