In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master('local[*]') \
        .appName('Basics') \
        .getOrCreate()

print(spark.version)

3.5.4


In [3]:
df_2018 = spark.read.csv(f"C:/Users/gyalm/OneDrive/Desktop/churn_prediction/churn_prediction/data/week2_data_for_student_lab.csv.gz", inferSchema=True, header=True)
df_2018.show(5)

+----------------+-----------+--------------------+------+-------+--------+------+-----+---------+-----------+------+-------------+
|transaction_date|member_type|           member_id|gender|product|quantity|amount|  age|card_type|branch_name|region|category_name|
+----------------+-----------+--------------------+------+-------+--------+------+-----+---------+-----------+------+-------------+
|      2018-04-01|     member|ff96777d-cfda-11e...|FEMALE|   3267|      16|  5100|30-35|  REGULAR|    E_Store|     3|  FROZEN FOOD|
|      2018-04-09|     member|00d89024-cfdb-11e...|FEMALE|   6748|       1|  1100|55-60|  REGULAR|    A_Store|     1|          PET|
|      2018-04-01|     member|00858b3b-cfdb-11e...|FEMALE|    420|       4|  5300|55-60|  REGULAR|    A_Store|     1|          PET|
|      2018-04-13|     member|001035ed-cfdb-11e...|FEMALE|   3178|      16|  5600|45-50|  REGULAR|    A_Store|     1|      GADGETS|
|      2018-04-15|     member|ff5fafeb-cfda-11e...|FEMALE|   2727|      19| 

In [4]:
df_2018.createOrReplaceTempView("df_2018_view")

In [5]:
spark.sql('''SELECT * FROM df_2018_view LIMIT 5''').show()

+----------------+-----------+--------------------+------+-------+--------+------+-----+---------+-----------+------+-------------+
|transaction_date|member_type|           member_id|gender|product|quantity|amount|  age|card_type|branch_name|region|category_name|
+----------------+-----------+--------------------+------+-------+--------+------+-----+---------+-----------+------+-------------+
|      2018-04-01|     member|ff96777d-cfda-11e...|FEMALE|   3267|      16|  5100|30-35|  REGULAR|    E_Store|     3|  FROZEN FOOD|
|      2018-04-09|     member|00d89024-cfdb-11e...|FEMALE|   6748|       1|  1100|55-60|  REGULAR|    A_Store|     1|          PET|
|      2018-04-01|     member|00858b3b-cfdb-11e...|FEMALE|    420|       4|  5300|55-60|  REGULAR|    A_Store|     1|          PET|
|      2018-04-13|     member|001035ed-cfdb-11e...|FEMALE|   3178|      16|  5600|45-50|  REGULAR|    A_Store|     1|      GADGETS|
|      2018-04-15|     member|ff5fafeb-cfda-11e...|FEMALE|   2727|      19| 

In [6]:
spark.sql('''SELECT COUNT(DISTINCT member_id) AS MemberId FROM df_2018_view''').show()

+--------+
|MemberId|
+--------+
|  356982|
+--------+



In [7]:
spark.sql('''SELECT COUNT(DISTINCT member_id) AS MemberId FROM df_2018_view WHERE transaction_date LIKE "%2018-04%"''').show()

+--------+
|MemberId|
+--------+
|  101637|
+--------+



In [8]:
spark.sql('''
WITH customers_in_april AS (
  SELECT DISTINCT member_id
  FROM df_2018_view
  WHERE transaction_date BETWEEN '2018-04-01' AND '2018-04-30'
), 
customers_in_april_joined_with_past_3_months_txn AS (
  SELECT
    a.member_id, 
    b.quantity, 
    b.transaction_date
  FROM
    customers_in_april AS a
  LEFT JOIN
    df_2018_view AS b
  ON
    a.member_id = b.member_id
  WHERE
    b.transaction_date BETWEEN '2018-01-01' AND '2018-03-31'
), 
aggregated_total_txns_in_april AS (
  SELECT
    member_id, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-01-01' AND '2018-01-31' THEN quantity ELSE 0 END) AS pm3_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-02-01' AND '2018-02-28' THEN quantity ELSE 0 END) AS pm2_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-03-01' AND '2018-03-31' THEN quantity ELSE 0 END) AS pm1_total_txn,
    '2018-04-01' AS PARTITION_MONTH
  FROM
    customers_in_april_joined_with_past_3_months_txn
  GROUP BY
    member_id
),
customers_in_may AS (
  SELECT DISTINCT member_id
  FROM df_2018_view
  WHERE transaction_date BETWEEN '2018-05-01' AND '2018-05-31'
), 
customers_in_may_joined_with_past_3_months_txn AS (
  SELECT
    a.member_id, 
    b.quantity, 
    b.transaction_date
  FROM
    customers_in_may AS a
  LEFT JOIN
    df_2018_view AS b
  ON
    a.member_id = b.member_id
  WHERE
    b.transaction_date BETWEEN '2018-02-01' AND '2018-04-30'
), 
aggregated_total_txns_in_may AS (
  SELECT
    member_id, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-02-01' AND '2018-02-28' THEN quantity ELSE 0 END) AS pm3_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-03-01' AND '2018-03-31' THEN quantity ELSE 0 END) AS pm2_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-04-01' AND '2018-04-30' THEN quantity ELSE 0 END) AS pm1_total_txn,
    '2018-05-01' AS PARTITION_MONTH
  FROM
    customers_in_may_joined_with_past_3_months_txn
  GROUP BY
    member_id
),
customers_in_june AS (
  SELECT DISTINCT member_id
  FROM df_2018_view
  WHERE transaction_date BETWEEN '2018-06-01' AND '2018-06-30'
), 
customers_in_june_joined_with_past_3_months_txn AS (
  SELECT
    a.member_id, 
    b.quantity, 
    b.transaction_date
  FROM
    customers_in_june AS a
  LEFT JOIN
    df_2018_view AS b
  ON
    a.member_id = b.member_id
  WHERE
    b.transaction_date BETWEEN '2018-03-01' AND '2018-05-31'
), 
aggregated_total_txns_in_june AS (
  SELECT
    member_id, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-03-01' AND '2018-03-31' THEN quantity ELSE 0 END) AS pm3_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-04-01' AND '2018-04-30' THEN quantity ELSE 0 END) AS pm2_total_txn, 
    SUM(CASE WHEN transaction_date BETWEEN '2018-05-01' AND '2018-05-31' THEN quantity ELSE 0 END) AS pm1_total_txn,
    '2018-06-01' AS PARTITION_MONTH
  FROM
    customers_in_june_joined_with_past_3_months_txn
  GROUP BY
    member_id
),
union_all_aggregated_txns AS (
  SELECT * FROM aggregated_total_txns_in_april
  UNION ALL
  SELECT * FROM aggregated_total_txns_in_may
  UNION ALL
  SELECT * FROM aggregated_total_txns_in_june
)
SELECT
    PARTITION_MONTH,
    COUNT(member_id) AS total_member_id
FROM
    union_all_aggregated_txns
GROUP BY
    PARTITION_MONTH
ORDER BY
    PARTITION_MONTH
''').show()

+---------------+---------------+
|PARTITION_MONTH|total_member_id|
+---------------+---------------+
|     2018-04-01|          59143|
|     2018-05-01|          84325|
|     2018-06-01|          89658|
+---------------+---------------+

