# ****# DAY 3 (11/01/26) â€“ PySpark Transformations Deep Dive.**
# # 
# tasks:
# 1. Load full e-commerce dataset
# 2. Perform complex joins
# 3. Calculate running totals with window functions
# **4. Create derived features****

### 1. Defining the dataframe

In [0]:
df = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv", header=True, inferSchema=True) #read data into df


## 2. Complex Joins...

In [0]:
# Drop 'brand' from products to avoid duplicate columns
products = df.select("product_id", "category_id", "category_code").dropDuplicates()

#user dimensions
users = df.select("user_id").dropDuplicates()

In [0]:
#Join facts with dimensions..
e = df.alias("e")
p = products.alias("p")
u = users.alias("u")

# Drop duplicate columns from users and products before join
events_enriched = (
    e.join(p, e.product_id == p.product_id, "left")
     .join(u, e.user_id == u.user_id, "left")
     .drop(u.user_id)
)

'''Now events_enriched is our star-schema style fact table.'''


'Now events_enriched is our star-schema style fact table.'

In [0]:
events_enriched \
    .filter("e.user_id IS NOT NULL") \
    .orderBy("e.user_id", "event_time") \
    .select("e.user_id", "event_time", "price") \
    .show(20, truncate=False)


+---------+-------------------+-------+
|user_id  |event_time         |price  |
+---------+-------------------+-------+
|33869381 |2019-10-23 20:04:08|769.65 |
|64078358 |2019-10-13 00:13:46|0.0    |
|183503497|2019-10-02 21:43:00|15.77  |
|184265397|2019-10-04 17:44:37|143.89 |
|184265397|2019-10-04 17:45:18|143.89 |
|184265397|2019-10-04 17:50:50|111.46 |
|184265397|2019-10-04 17:51:04|111.46 |
|184265397|2019-10-15 17:18:59|79.77  |
|184265397|2019-10-15 17:19:28|79.77  |
|195082191|2019-10-10 03:35:36|161.88 |
|200673532|2019-10-10 15:02:36|99.42  |
|200673532|2019-10-12 07:58:37|99.42  |
|200673532|2019-10-12 14:47:07|73.36  |
|200673532|2019-10-13 14:26:21|73.36  |
|205053188|2019-10-09 10:30:19|162.17 |
|205053188|2019-10-09 10:30:44|162.17 |
|208669541|2019-10-04 05:49:14|18.47  |
|208669541|2019-10-04 05:49:46|8.75   |
|209714031|2019-10-20 18:29:45|1686.02|
|209714031|2019-10-20 18:30:08|75.16  |
+---------+-------------------+-------+
only showing top 20 rows


## 3. Running totals using Window Functions using window functions...

In [0]:
events_enriched.select(
    "e.event_time",
    "e.user_id",
    "e.product_id",
    "brand",
    "p.category_code",
    "e.price"
).show(10, truncate=False)


+-------------------+---------+----------+--------+-----------------------------------+------+
|event_time         |user_id  |product_id|brand   |category_code                      |price |
+-------------------+---------+----------+--------+-----------------------------------+------+
|2019-10-01 00:00:10|520571932|28719074  |baden   |apparel.shoes.keds                 |102.71|
|2019-10-01 00:00:11|537918940|1004545   |huawei  |electronics.smartphone             |566.01|
|2019-10-01 00:00:08|550978835|31500053  |luminarc|NULL                               |41.16 |
|2019-10-01 00:00:05|512742880|1480613   |pulser  |computers.desktop                  |908.62|
|2019-10-01 00:00:00|554748717|3900821   |aqua    |appliances.environment.water_heater|33.2  |
|2019-10-01 00:00:01|550050854|1307067   |lenovo  |computers.notebook                 |251.74|
|2019-10-01 00:00:01|519107250|17200506  |NULL    |furniture.living_room.sofa         |543.1 |
|2019-10-01 00:00:00|541312140|44600062  |shiseido

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col

w = Window.partitionBy(col("e.user_id")).orderBy(col("e.event_time"))

events_enriched = events_enriched.withColumn(
    "running_user_spend",
    sum(col("e.price")).over(w)
)

events_enriched.select(
    "e.user_id",
    "e.event_time",
    "e.price",
    "running_user_spend"
).show(20, truncate=False)


+---------+-------------------+-------+------------------+
|user_id  |event_time         |price  |running_user_spend|
+---------+-------------------+-------+------------------+
|205053188|2019-10-09 10:30:19|162.17 |162.17            |
|205053188|2019-10-09 10:30:44|162.17 |324.34            |
|209714031|2019-10-20 18:29:45|1686.02|1686.02           |
|209714031|2019-10-20 18:30:08|75.16  |1761.18           |
|209714031|2019-10-24 18:18:25|1686.02|3447.2            |
|209714031|2019-10-24 18:27:59|398.98 |3846.18           |
|209714031|2019-10-24 18:29:31|398.98 |4245.16           |
|209714031|2019-10-25 06:59:08|398.98 |4644.139999999999 |
|209714031|2019-10-29 19:39:37|61.75  |4705.889999999999 |
|209714031|2019-10-29 19:41:10|97.79  |4803.679999999999 |
|209714031|2019-10-29 19:42:34|16.73  |4820.409999999999 |
|209714031|2019-10-29 19:42:57|13.61  |4834.019999999999 |
|209714031|2019-10-29 19:45:06|35.78  |4869.799999999998 |
|209714031|2019-10-29 19:45:44|35.78  |4905.579999999998

## 4. Create Derived Features..

### A. Session total spend

In [0]:

from pyspark.sql.functions import sum

session_spend = events_enriched.groupBy("user_session").agg(sum("price").alias("session_total_spend"))


#Joining back:

events_enriched = events_enriched.join(session_spend, "user_session", "left")

### B. User total lifetime spend

In [0]:
user_spend = events_enriched.groupBy("user_id") \
    .agg(sum("price").alias("user_lifetime_spend"))

events_enriched = events_enriched.join(user_spend, "user_id", "left")


### C. Is this a purchase?

In [0]:
from pyspark.sql.functions import when

events_enriched = events_enriched.withColumn(
    "is_purchase",
    when(events_enriched["e.event_type"] == "purchase", 1).otherwise(0)
)


### D. High-value user flag

In [0]:
events_enriched = events_enriched.withColumn(
    "high_value_user",
    when(events_enriched.user_lifetime_spend > 1000, 1).otherwise(0)
)


In [0]:
#checking dervied features...

events_enriched.select(
    "user_id",
    "user_session",
    "price",
    "session_total_spend",
    "user_lifetime_spend",
    "is_purchase",
    "high_value_user"
).show(20, truncate=False)


+---------+------------------------------------+-------+-------------------+-------------------+-----------+---------------+
|user_id  |user_session                        |price  |session_total_spend|user_lifetime_spend|is_purchase|high_value_user|
+---------+------------------------------------+-------+-------------------+-------------------+-----------+---------------+
|550663441|b69f9a29-3ffa-44c4-8ad8-911082f2d575|39.64  |217.72             |7257.230000000001  |0          |1              |
|512613212|ec9f3269-1e71-4460-9342-d279bf76dd4a|168.34 |336.68             |12741.139999999998 |0          |1              |
|518086063|016a801d-f91d-4efb-b807-8ab84f8719e6|214.16 |214.16             |74242.09000000019  |0          |1              |
|518792326|115fd5a4-6d0e-49aa-926c-6d65f09db426|1027.05|4005.24            |86470.07999999999  |0          |1              |
|523050704|c76831a9-684e-452a-a18d-2767b5ddd6b0|286.84 |2647.9399999999996 |19397.479999999996 |0          |1              |
